.*)',
'desc' : '(?P.*?)
',
'writer' : 'Writer: * *(?P.*)',
'cover_artist' : 'Cover Artist: * *(?P.*)',
'penciller' : 'Penciller: * *(?P.*)',
'inker' : 'Inker: * *(?P.*)',
'letterer' : 'Letterer: * *(?P.*)',
'colorist' : 'Colorist: * *(?P.*)',
'genre' : 'Genre: * *(?P.*?)
',
'format' : 'Format: * *(?P.*?)
',
}
# Compile patterns objects
self.__regExpsPO = {}
for k, pattern in self.__regExps.iteritems():
self.__regExpsPO[k] = re.compile(pattern)
self.__domTree = BasicTellicoDOM()
def run(self, title):
    """
    Entry point of the parser.

    Fetches every entry matching *title*, fills the internal DOM tree,
    then dumps the whole tree to stdout in Tellico's XML format.
    """
    # Gather data for all matching entries first...
    self.__getMovie(title)
    # ...then emit the complete tree in one go for Tellico to consume.
    self.__domTree.printXMLTree()
def __getHTMLContent(self, url):
    """
    Fetch raw HTML data from *url* into self.__data.

    Fixed: the handle is now closed in a finally block, so the network
    resource is released even when read() raises (the original leaked
    the connection in that case).
    """
    u = urllib2.urlopen(url)
    try:
        self.__data = u.read()
    finally:
        u.close()
def __fetchMovieLinks(self):
    """
    Retrieve all links related to the search. self.__data contains HTML content fetched by self.__getHTMLContent()
    that need to be parsed.

    Returns the list of regex match tuples, or None when nothing matched.
    """
    # NOTE(review): this pattern looks corrupted — "(?P.*?)" is not valid
    # regex syntax (the group name, e.g. "(?P<title>...)", and the
    # surrounding HTML anchor markup appear to have been stripped from
    # this copy). As written, re.findall raises sre_constants.error at
    # runtime. Recover the original <a href="%s..."> pattern from the
    # upstream script before relying on this method.
    # The .replace('?', '\?') escapes any literal "?" in the base path so
    # it is not treated as a regex quantifier.
    matchList = re.findall(""".*?)">(?P.*?)""" % self.__basePath.replace('?', '\?'), self.__data)
    if not matchList: return None
    return matchList
def __fetchCover(self, path, delete = True):
"""
Fetch cover to /tmp. Returns base64 encoding of data.
The image is deleted if delete is True
"""
md5 = genMD5()
imObj = urllib2.urlopen(path.strip())
img = imObj.read()
imObj.close()
imgPath = "/tmp/%s.jpeg" % md5
try:
f = open(imgPath, 'w')
f.write(img)
f.close()
except:
print sys.stderr, "Error: could not write image into /tmp"
b64data = (md5 + '.jpeg', base64.encodestring(img))
# Delete temporary image
if delete:
try:
os.remove(imgPath)
except:
print sys.stderr, "Error: could not delete temporary image /tmp/%s.jpeg" % md5
return b64data
def __fetchMovieInfo(self, url):
    """
    Fetch one entry's page at *url* and extract its data.

    Returns a dict holding the parsed fields: 'title', 'issue',
    'pub_year', 'comments' (list), 'artist' (role -> name dict),
    'writer' and 'genre' (lists), 'pages', 'image' (a (filename,
    base64-data) tuple or None), plus hard-coded defaults for
    'publisher', 'language' and 'country'.
    """
    self.__getHTMLContent(url)

    # First grab picture data
    # NOTE(review): the pattern below is an empty string fed to "%" —
    # the original <img src="%s..."> regex (with its (?P<imgpath>...)
    # group) appears to have been stripped from this copy. As written
    # this raises TypeError ("not all arguments converted") at runtime;
    # restore the pattern from the upstream script before use.
    imgMatch = re.search("""""" % self.__coverPath, self.__data)
    if imgMatch:
        imgPath = self.__coverPath + imgMatch.group('imgpath')
        # Fetch cover and gets its base64 encoded data
        b64img = self.__fetchCover(imgPath)
    else:
        b64img = None

    # Now isolate data between ...
    # NOTE(review): the bare "elements" line below is a stray token —
    # almost certainly the tail of the comment above ("...between
    # <div ...> ... </div> elements") whose HTML markup was stripped.
    # As code it raises NameError if this point is reached.
    elements
    # re.S sets DOTALL; it makes the "." special character match any character at all, including a newline
    # NOTE(review): empty pattern here as well — the original delimiting
    # regex (containing a (?P<part>...) group) is missing, so the
    # m.group('part') call below raises IndexError. Restore upstream.
    m = re.search("""""", self.__data, re.S)
    self.__data = m.group('part')

    matches = {}
    data = {}
    data['comments'] = []
    data['artist'] = {}
    # Default values
    data['publisher'] = 'Dark Horse Comics'
    data['language'] = 'English'
    data['country'] = 'USA'
    data['image'] = b64img
    data['pub_year'] = NULLSTRING

    for name, po in self.__regExpsPO.iteritems():
        data[name] = NULLSTRING
        if name == 'desc':
            # 'desc' may match several times: findall on the raw pattern
            # with DOTALL + IGNORECASE instead of the compiled object.
            matches[name] = re.findall(self.__regExps[name], self.__data, re.S | re.I)
        else:
            matches[name] = po.search(self.__data)

        if matches[name]:
            if name == 'title':
                title = matches[name].group('title').strip()
                data[name] = title
                # Look for issue information
                # NOTE(review): "(?P[0-9]+)" is invalid regex syntax —
                # the group name (likely "(?P<issue>[0-9]+)") was
                # stripped; re.search raises at runtime. TODO confirm.
                m = re.search("#(?P[0-9]+)", title)
                if m:
                    data['issue'] = m.group('issue')
                else:
                    data['issue'] = ''

            elif name == 'pub_date':
                pub_date = matches[name].group('pub_date').strip()
                # Last 4 characters assumed to be the year part.
                data['pub_year'] = pub_date[-4:]
                # Add this to comments field
                data['comments'].insert(0, "Pub. Date: %s" % pub_date)

            elif name == 'desc':
                # Find biggest size
                # NOTE(review): 'max' shadows the builtin of the same
                # name inside this branch; works, but renaming advisable.
                max = 0
                for i in range(len(matches[name])):
                    if len(matches[name][i]) > len(matches[name][max]):
                        max = i
                data['comments'].append(matches[name][max].strip())

            elif name == 'writer':
                # We may find several writers
                # NOTE(review): '?a.*?>' is an invalid regex (dangling
                # '?'); presumably '</?a.*?>' originally, i.e. strip
                # anchor tags before splitting on commas. TODO confirm.
                data[name] = []
                writersList = re.sub('?a.*?>', '', matches[name].group('writer')).split(',')
                for d in writersList:
                    data[name].append(d.strip())

            elif name == 'cover_artist':
                data['artist']['Cover Artist'] = matches[name].group('cover_artist').strip()
            elif name == 'penciller':
                data['artist']['Penciller'] = matches[name].group('penciller').strip()
            elif name == 'inker':
                data['artist']['Inker'] = matches[name].group('inker').strip()
            elif name == 'colorist':
                data['artist']['Colorist'] = matches[name].group('colorist').strip()
            elif name == 'letterer':
                data['artist']['Letterer'] = matches[name].group('letterer').strip()

            elif name == 'genre':
                # We may find several genres
                # NOTE(review): same invalid '?a.*?>' pattern as the
                # 'writer' branch above — see note there.
                data[name] = []
                genresList = re.sub('?a.*?>', '', matches[name].group('genre')).split(',')
                for d in genresList:
                    data[name].append(d.strip())

            elif name == 'format':
                format = matches[name].group('format').strip()
                data['comments'].insert(1, format)
                # NOTE(review): "(?P[0-9]+)" invalid again — likely
                # "(?P<pages>[0-9]+)" originally. TODO confirm.
                m = re.search("(?P[0-9]+)", format)
                if m:
                    data['pages'] = m.group('pages')
                else:
                    data['pages'] = ''

    return data
def __getMovie(self, title):
    """
    Search for *title* on the site and add each matching entry to the
    DOM tree. Does nothing for an empty title; always returns None.
    """
    if not len(title): return
    self.__title = title

    # Fetch the search-results page for the (URL-quoted) title
    searchPage = "%s%s" % (self.__baseURL, self.__searchURL % urllib.quote(self.__title))
    self.__getHTMLContent(searchPage)

    # Collect the per-entry links found in the results page
    links = self.__fetchMovieLinks()
    if not links:
        return None

    for entry in links:
        entryLink = "%s%s" % (self.__movieURL, entry[0])
        data = self.__fetchMovieInfo(url = entryLink)
        # Keep a link back to the Dark Horse page (custom field)
        data['darkhorse'] = entryLink
        node = self.__domTree.addEntry(data)
        # Entries could be printed on-the-fly instead:
        #self.__domTree.printEntry(node)
def halt():
    """Emit a halt marker on stdout and terminate with exit status 0."""
    sys.stdout.write("HALT.\n")
    sys.exit(0)
def showUsage():
    """Print command-line usage on stdout and exit with status 1."""
    sys.stdout.write("Usage: %s comic\n" % sys.argv[0])
    sys.exit(1)
def main():
    """Entry point: require one CLI argument (the comic title) and run the parser on it."""
    if len(sys.argv) < 2:
        showUsage()
    DarkHorseParser().run(sys.argv[1])
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()