diff options
author | yvesf <yvesf-git@xapek.org> | 2010-12-01 17:16:12 +0100 |
---|---|---|
committer | yvesf <yvesf-git@xapek.org> | 2010-12-01 17:16:12 +0100 |
commit | 9b09da2a0d5806a161c9b25f62193be2d0c2eec1 (patch) | |
tree | de7a96f1aba6ec8b8305380df7494616aa2c2ed1 /indexer.py | |
parent | e69a04675e142d8618433a785ad33e8167ef99d2 (diff) | |
download | booksearch-9b09da2a0d5806a161c9b25f62193be2d0c2eec1.tar.gz booksearch-9b09da2a0d5806a161c9b25f62193be2d0c2eec1.zip |
add metadata extraction
Diffstat (limited to 'indexer.py')
-rw-r--r-- | indexer.py | 41 |
1 files changed, 31 insertions, 10 deletions
```diff
@@ -15,7 +15,8 @@ def pdf_extract_metadata(filepath):
     from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
     from pdfminer.converter import TextConverter
     from pdfminer.layout import LAParams
-    from lxml import etree
+    import lxml
+    import lxml.etree
 
     outbuf = StringIO.StringIO()
     rsrcmgr = PDFResourceManager()
@@ -28,23 +29,42 @@ def pdf_extract_metadata(filepath):
     doc.initialize("")
 
     namespaces={
+        "x":"adobe:ns:meta/",
         "dc":"http://purl.org/dc/elements/1.1/",
         "rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
-        "pdf":"http://ns.adobe.com/pdf/1.3/", }
+        "pdf":"http://ns.adobe.com/pdf/1.3/",
+        "xap":"http://ns.adobe.com/xap/1.0/",
+        "xmpMM":"http://ns.adobe.com/xap/1.0/mm/"}
+
+    metadata = {}
+    def add_metadata(tree, name, xpath_expr):
+        d=tree.xpath(xpath_expr, namespaces=namespaces)
+        if d:
+            metadata[name] = "".join(d).strip()
 
     if doc.catalog.has_key("Metadata"):
         obj_ref = doc.catalog["Metadata"]
         obj_stream = obj_ref.resolve()
         if obj_stream.attrs['Subtype'].name == "XML":
-            obj_data = obj_stream.get_data()
-            if obj_data.endswith("\nf"):
-                obj_data = obj_data[:-len("\nf")]
-            print obj_data
-            tree = etree.parse(StringIO.StringIO(obj_data))
-            print tree.xpath("//dc:title/rdf:Alt/rdf:li", namespaces=namespaces)[0].text
-            return obj_data
+            try:
+                obj_data = obj_stream.get_data()[:-2] # XXX remove trailing chars
+                print obj_data.strip()
+                tree = lxml.etree.parse(StringIO.StringIO(obj_data))
+                add_metadata(tree, "dc:title", "//rdf:Description/dc:title//*/text()")
+                add_metadata(tree, "dc:creator", "//rdf:Description/dc:creator//*/text()")
+                add_metadata(tree, "xap:CreateDate", "//rdf:Description/xap:CreateDate/text()")
+                add_metadata(tree, "xap:CreaterTool", "//rdf:Description/xap:CreatorTool/text()")
+                add_metadata(tree, "xap:ModifyDate", "//rdf:Description/xap:ModifyDate/text()")
+                add_metadata(tree, "xap:CreateDate", "//rdf:Description/xap:CreateDate/text()")
+                add_metadata(tree, "xap:MetadataDate", "//rdf:Description/xap:MetadataDate/text()")
+                add_metadata(tree, "pdf:Producer", "//rdf:Description/pdf:Producer/text()")
+                add_metadata(tree, "xmpMM:DocumentID", "//rdf:Description/xmpMM:DocumentID/text()")
+                add_metadata(tree, "xmpMM:InstanceID", "//rdf:Description/xmpMM:InstanceID/text()")
+            except lxml.etree.XMLSyntaxError,e:
+                print e
     else:
-        return None
+        pass
+    return metadata
 
 def pdf_extract_text_pdfminer(filepath):
     from pdfminer.pdfparser import PDFParser, PDFDocument
@@ -167,6 +187,7 @@ def process_file(filepath):
                                          content=content)
                 pagenumber += 1
 
+        print str_format(u"{pid} commit", pid=os.getpid())
         writer_book.commit()
     except KeyboardInterrupt:
         return "KeyboardInterrupt"
```