# tellico/src/fetch/scripts/fr.allocine.py

#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

# ***************************************************************************
#    copyright            : (C) 2006 by Mathias Monnerville
#    email                : tellico@monnerville.com
# ***************************************************************************
#
# ***************************************************************************
# *                                                                         *
# *   This program is free software; you can redistribute it and/or modify  *
# *   it under the terms of version 2 of the GNU General Public License as  *
# *   published by the Free Software Foundation;                            *
# *                                                                         *
# ***************************************************************************

# Version 0.4: 2007-08-27
# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres 
# could not be retrieved. Fixed bad http request error due to some changes in HTML code.
#
# Version 0.3:
# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
# 
# Version 0.2:
# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
# 
# Version 0.1:
# * Initial release.

import sys, os, re, md5, random
import urllib, urllib2, time, base64
import xml.dom.minidom

# XML declaration; BasicTellicoDOM.printXML() prints it as the first output line.
XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>"""
# Document type declaration; printed right after XML_HEADER and before the
# serialized <tellico> root element.
DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">"""

# Version of this script (matches the latest entry of the changelog above).
VERSION = "0.4"

def genMD5():
	"""
	Return the MD5 hex digest (32 lowercase hex characters) of a random number.

	The digest is only used as a unique-looking identifier; it is derived from
	random.random() and is therefore not suitable for security purposes.
	"""
	try:
		from hashlib import md5 as newDigest
	except ImportError:
		# Python < 2.5 has no hashlib; fall back to the legacy md5 module.
		from md5 import new as newDigest

	# Hash the textual form of the random float. Encoding to ASCII yields the
	# byte string that the digest constructor requires.
	seed = str(random.random())
	return newDigest(seed.encode('ascii')).hexdigest()

class BasicTellicoDOM:
	"""
	Builds the Tellico XML document that is printed to stdout.

	The document consists of a <tellico> root holding one <collection>
	(title 'My Movies', type '3'). The collection contains the field
	definitions, one <entry> per movie added through addEntry() and, once
	printXML() is called, an <images> element with the cover images.

	Text values handed to addEntry() are expected to be Latin-1 byte strings;
	they are stored re-encoded as UTF-8.
	"""

	def __init__(self):
		"""
		Create the document skeleton (root, collection, fields, images).
		"""
		self.__doc = xml.dom.minidom.Document()
		self.__root = self.__doc.createElement('tellico')
		self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
		self.__root.setAttribute('syntaxVersion', '9')

		self.__collection = self.__doc.createElement('collection')
		self.__collection.setAttribute('title', 'My Movies')
		self.__collection.setAttribute('type', '3')

		self.__fields = self.__doc.createElement('fields')
		# Field named '_default': stands for all default (standard) fields
		self.__dfltField = self.__doc.createElement('field')
		self.__dfltField.setAttribute('name', '_default')

		# Custom 'titre-original' field holding the original movie title
		self.__customField = self.__doc.createElement('field')
		for attrName, attrValue in (	('name', 'titre-original'),
										('title', 'Original Title'),
										('flags', '8'),
										('category', 'General'),
										('format', '1'),
										('type', '1'),
										('i18n', 'yes') ):
			self.__customField.setAttribute(attrName, attrValue)

		self.__fields.appendChild(self.__dfltField)
		self.__fields.appendChild(self.__customField)
		self.__collection.appendChild(self.__fields)

		# The <images> element is only attached to the collection by printXML()
		self.__images = self.__doc.createElement('images')

		self.__root.appendChild(self.__collection)
		self.__doc.appendChild(self.__root)

		# Current movie id
		self.__currentId = 0

	def __utf8(self, text):
		"""
		Re-encode a Latin-1 byte string as UTF-8
		"""
		return unicode(text, 'latin-1').encode('utf-8')

	def __textElement(self, tag, text):
		"""
		Return a new <tag> element whose only child is text (Latin-1 byte
		string), re-encoded as UTF-8
		"""
		node = self.__doc.createElement(tag)
		node.appendChild(self.__doc.createTextNode(self.__utf8(text)))
		return node

	def __listElement(self, listTag, itemTag, values):
		"""
		Return a <listTag> element holding one <itemTag> text element per value
		"""
		listNode = self.__doc.createElement(listTag)
		for value in values:
			listNode.appendChild(self.__textElement(itemTag, value))
		return listNode

	def addEntry(self, movieData):
		"""
		Add a movie entry

		movieData is a dictionary providing:
		- 'title', 'otitle', 'year', 'nat', 'time', 'allocine', 'plot':
		  Latin-1 byte strings
		- 'genres', 'actors', 'dirs': iterables of Latin-1 byte strings
		- 'image': a false value when there is no cover, otherwise a sequence
		  whose first item is the image id and second item the image data
		  (text stored inside the <image> element)

		A missing key raises KeyError. Each call uses the next entry id,
		starting at 0.
		"""
		d = movieData
		entryNode = self.__doc.createElement('entry')
		entryNode.setAttribute('id', str(self.__currentId))

		# Each cast member is a two-column row; the second column is left empty
		castsNode = self.__doc.createElement('casts')
		for actor in d['actors']:
			castNode = self.__doc.createElement('cast')
			castNode.appendChild(self.__textElement('column', actor))
			castNode.appendChild(self.__doc.createElement('column'))
			castsNode.appendChild(castNode)

		# Children are appended in a fixed order; the nodes are referenced
		# directly instead of being looked up by name through eval().
		# The accented letter of 'allociné-link' is written as an escape so the
		# element name does not depend on the encoding of this source file.
		children = (	self.__textElement('title', d['title']),
						self.__textElement('titre-original', d['otitle']),
						self.__textElement('year', d['year']),
						self.__listElement('genres', 'genre', d['genres']),
						self.__listElement('nationalitys', 'nat', [d['nat']]),
						castsNode,
						self.__listElement('directors', 'director', d['dirs']),
						self.__textElement('running-time', d['time']),
						self.__textElement(self.__utf8('allocin\xe9-link'), d['allocine']),
						self.__textElement('plot', d['plot']) )
		for child in children:
			entryNode.appendChild(child)

		if d['image']:
			imageId = d['image'][0]
			imageNode = self.__textElement('image', d['image'][1])
			imageNode.setAttribute('format', 'JPEG')
			imageNode.setAttribute('id', imageId)
			imageNode.setAttribute('width', '120')
			imageNode.setAttribute('height', '160')

			# The entry refers to its cover through the image id
			coverNode = self.__doc.createElement('cover')
			coverNode.appendChild(self.__doc.createTextNode(imageId))

			entryNode.appendChild(coverNode)
			self.__images.appendChild(imageNode)

		self.__collection.appendChild(entryNode)

		self.__currentId += 1

	def printXML(self):
		"""
		Outputs XML content to stdout
		"""
		# Attach the collected cover images after all the entries
		self.__collection.appendChild(self.__images)
		# Single-argument print(...) gives the same output as the print statement
		print(XML_HEADER)
		print(DOCTYPE)
		print(self.__root.toxml())


class AlloCineParser:
	"""
	Searches allocine.fr for a movie title, parses every movie page found and
	prints the collected data to stdout as a Tellico XML document.
	"""

	# Regular expressions extracting each movie field from a movie page.
	# The accented letters of the French labels are written as \xe9 escapes
	# (Latin-1 'e acute') so that the patterns do not depend on the encoding
	# of this source file.
	_REGEXPS = {
		'title': r'<title>(?P<title>.+?)</title>',
		'dirs': r'R\xe9alis\xe9 par <a.*?>(?P<step1>.+?)</a>.*?</h4>',
		'actors': r'<h4>Avec *<a.*?>(?P<step1>.+)</a> &nbsp;',
		'nat': r'<h4>Film *(?P<nat>.+?)[,\.]',
		'genres': r'<h4>Genre *: *<a.*?>(?P<step1>.+?)</a></h4>',
		'time': r'<h4>Dur\xe9e *: *(?P<hours>[0-9])?h *(?P<mins>[0-9]{1,2})min',
		'year': r'Ann\xe9e de production *: *(?P<year>[0-9]{4})',
		# Original movie title
		'otitle': r'Titre original *: *<i>(?P<otitle>.+?)</i>',
		'plot': r'(?s)<td valign="top" style="padding:10 0 0 0"><div align="justify"><h4> *(?P<plot>.+?) *</h4>',
		'image': r'<td valign="top" width="120".*?<img src="(?P<image>.+?)" border',
	}

	# Fields whose match is a comma separated list of (possibly linked) names
	_LIST_FIELDS = ('dirs', 'actors', 'genres')

	def __init__(self):
		"""
		Set up the allocine.fr URLs and an empty Tellico document
		"""
		self.__baseURL = 'http://www.allocine.fr'
		self.__basePath = '/film/fichefilm_gen_cfilm'
		self.__searchURL = 'http://www.allocine.fr/recherche/?motcle=%s&f=3&rub=1'
		self.__movieURL = self.__baseURL + self.__basePath

		self.__domTree = BasicTellicoDOM()

	def run(self, title):
		"""
		Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree
		to stdout (in tellico format) so that tellico can use it.
		"""
		self.__getMovie(title)
		# Print results to stdout
		self.__domTree.printXML()

	def __getHTMLContent(self, url):
		"""
		Fetch HTML data from url and keep it as the current page content
		"""
		u = urllib2.urlopen(url)
		try:
			self.__data = u.read()
		finally:
			# Release the connection even when reading fails
			u.close()

	def __fetchMovieLinks(self):
		"""
		Retrieve all links related to movie

		Returns a list of (page, title) tuples found in the current page
		content, or None when there is no match.
		"""
		pattern = r'<h4><a *href="%s=(?P<page>.*?\.html?)" *class="link1">(?P<title>.*?)</a>' % self.__basePath
		return re.findall(pattern, self.__data) or None

	def __splitNames(self, html):
		"""
		Turn 'a, <a href=...>b</a>' into ['a', 'b']: drop the anchor tags,
		split on commas and strip every item
		"""
		return [item.strip() for item in re.sub('</?a.*?>', '', html).split(',')]

	def __fetchImage(self, url):
		"""
		Download the image at url

		Returns a (image id, base64 encoded data) tuple. The image id is a
		random MD5 hex digest followed by '.jpeg'.
		"""
		imageId = genMD5() + '.jpeg'
		imObj = urllib2.urlopen(url)
		try:
			img = imObj.read()
		finally:
			imObj.close()
		# The data is embedded in the XML output, so no temporary file is needed
		return (imageId, base64.encodestring(img))

	def __parseMovieInfo(self):
		"""
		Extract the movie fields from the current page content

		Returns a dictionary with one item per key of _REGEXPS. A field that
		is not found on the page is set to '' (an empty value for both the
		string and the list fields).
		"""
		data = {}

		for name, regexp in self._REGEXPS.items():
			if name == 'image':
				match = re.findall(regexp, self.__data, re.S | re.I)
			else:
				match = re.search(regexp, self.__data)

			if not match:
				data[name] = ''
			elif name in self._LIST_FIELDS:
				data[name] = self.__splitNames(match.group('step1'))
			elif name == 'time':
				# The hours part is optional in the pattern: count it as 0
				# when the page only gives minutes
				hours = int(match.group('hours') or 0)
				data[name] = str(hours * 60 + int(match.group('mins')))
			elif name == 'image':
				# Only the first image found is used
				data[name] = self.__fetchImage(match[0].strip())
			else:
				# Plain text fields: the group is named after the field
				data[name] = match.group(name).strip()

		return data

	def __fetchMovieInfo(self, url):
		"""
		Looks for movie information

		Downloads the movie page at url and returns the dictionary built by
		__parseMovieInfo().
		"""
		self.__getHTMLContent(url)
		return self.__parseMovieInfo()

	def __getMovie(self, title):
		"""
		Search for title and add one entry to the DOM tree per movie found

		Does nothing when title is empty or when the search gives no link.
		"""
		if not title:
			return

		self.__title = title
		self.__getHTMLContent(self.__searchURL % urllib.quote(self.__title))

		# Get all links
		links = self.__fetchMovieLinks()
		if not links:
			return None

		# Now retrieve infos
		for entry in links:
			url = "%s=%s" % (self.__movieURL, entry[0])
			data = self.__fetchMovieInfo(url=url)
			# Add allocine link (custom field)
			data['allocine'] = url
			self.__domTree.addEntry(data)


def showUsage():
	"""
	Print the command line usage and exit with status 1.

	The message goes to stderr: stdout is reserved for the XML document
	that the script prints as its result.
	"""
	sys.stderr.write("Usage: %s movietitle\n" % sys.argv[0])
	sys.exit(1)

def main():
	"""
	Script entry point: look up the movie title given as first command line
	argument, or show the usage when no argument was given.
	"""
	arguments = sys.argv[1:]
	if not arguments:
		showUsage()

	AlloCineParser().run(arguments[0])

# Run the fetcher only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
	main()