From 9b09da2a0d5806a161c9b25f62193be2d0c2eec1 Mon Sep 17 00:00:00 2001 From: yvesf Date: Wed, 1 Dec 2010 17:16:12 +0100 Subject: add metadata extraction --- indexer.py | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'indexer.py') diff --git a/indexer.py b/indexer.py index 3ec1e8b..c909f68 100644 --- a/indexer.py +++ b/indexer.py @@ -15,7 +15,8 @@ def pdf_extract_metadata(filepath): from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.converter import TextConverter from pdfminer.layout import LAParams - from lxml import etree + import lxml + import lxml.etree outbuf = StringIO.StringIO() rsrcmgr = PDFResourceManager() @@ -28,23 +29,42 @@ def pdf_extract_metadata(filepath): doc.initialize("") namespaces={ + "x":"adobe:ns:meta/", "dc":"http://purl.org/dc/elements/1.1/", "rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#", - "pdf":"http://ns.adobe.com/pdf/1.3/", } + "pdf":"http://ns.adobe.com/pdf/1.3/", + "xap":"http://ns.adobe.com/xap/1.0/", + "xmpMM":"http://ns.adobe.com/xap/1.0/mm/"} + + metadata = {} + def add_metadata(tree, name, xpath_expr): + d=tree.xpath(xpath_expr, namespaces=namespaces) + if d: + metadata[name] = "".join(d).strip() if doc.catalog.has_key("Metadata"): obj_ref = doc.catalog["Metadata"] obj_stream = obj_ref.resolve() if obj_stream.attrs['Subtype'].name == "XML": - obj_data = obj_stream.get_data() - if obj_data.endswith("\nf"): - obj_data = obj_data[:-len("\nf")] - print obj_data - tree = etree.parse(StringIO.StringIO(obj_data)) - print tree.xpath("//dc:title/rdf:Alt/rdf:li", namespaces=namespaces)[0].text - return obj_data + try: + obj_data = obj_stream.get_data()[:-2] # XXX remove trailing chars + print obj_data.strip() + tree = lxml.etree.parse(StringIO.StringIO(obj_data)) + add_metadata(tree, "dc:title", "//rdf:Description/dc:title//*/text()") + add_metadata(tree, "dc:creator", "//rdf:Description/dc:creator//*/text()") + add_metadata(tree, "xap:CreateDate", "//rdf:Description/xap:CreateDate/text()") + add_metadata(tree, "xap:CreaterTool", "//rdf:Description/xap:CreatorTool/text()") + add_metadata(tree, "xap:ModifyDate", "//rdf:Description/xap:ModifyDate/text()") + add_metadata(tree, "xap:CreateDate", "//rdf:Description/xap:CreateDate/text()") + add_metadata(tree, "xap:MetadataDate", "//rdf:Description/xap:MetadataDate/text()") + add_metadata(tree, "pdf:Producer", "//rdf:Description/pdf:Producer/text()") + add_metadata(tree, "xmpMM:DocumentID", "//rdf:Description/xmpMM:DocumentID/text()") + add_metadata(tree, "xmpMM:InstanceID", "//rdf:Description/xmpMM:InstanceID/text()") + except lxml.etree.XMLSyntaxError,e: + print e else: - return None + pass + return metadata def pdf_extract_text_pdfminer(filepath): from pdfminer.pdfparser import PDFParser, PDFDocument @@ -167,6 +187,7 @@ def process_file(filepath): content=content) pagenumber += 1 + print str_format(u"{pid} commit", pid=os.getpid()) writer_book.commit() except KeyboardInterrupt: return "KeyboardInterrupt" -- cgit v1.2.1