From 15f17b4fce5a1e4b70709aac569c18d8cf57deba Mon Sep 17 00:00:00 2001 From: yvesf Date: Fri, 26 Nov 2010 23:59:59 +0100 Subject: add pdfminer for text extraction; replaced all ' with " metadata hacking --- indexer.py | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 10 deletions(-) (limited to 'indexer.py') diff --git a/indexer.py b/indexer.py index 913591a..3ec1e8b 100644 --- a/indexer.py +++ b/indexer.py @@ -3,11 +3,90 @@ import os import sys import time -import pyPdf import whoosh.index as index import whoosh.writing as writing import whoosh.fields as fields from compat import str_format +import StringIO + + +def pdf_extract_metadata(filepath): + from pdfminer.pdfparser import PDFParser, PDFDocument + from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager + from pdfminer.converter import TextConverter + from pdfminer.layout import LAParams + from lxml import etree + + outbuf = StringIO.StringIO() + rsrcmgr = PDFResourceManager() + device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams()) + parser = PDFParser(file(filepath, "rb")) + doc = PDFDocument() + + parser.set_document(doc) + doc.set_parser(parser) + doc.initialize("") + + namespaces={ + "dc":"http://purl.org/dc/elements/1.1/", + "rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "pdf":"http://ns.adobe.com/pdf/1.3/", } + + if doc.catalog.has_key("Metadata"): + obj_ref = doc.catalog["Metadata"] + obj_stream = obj_ref.resolve() + if obj_stream.attrs['Subtype'].name == "XML": + obj_data = obj_stream.get_data() + if obj_data.endswith("\nf"): + obj_data = obj_data[:-len("\nf")] + print obj_data + tree = etree.parse(StringIO.StringIO(obj_data)) + print tree.xpath("//dc:title/rdf:Alt/rdf:li", namespaces=namespaces)[0].text + return obj_data + else: + return None + +def pdf_extract_text_pdfminer(filepath): + from pdfminer.pdfparser import PDFParser, PDFDocument + from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager + from pdfminer.converter import TextConverter + from pdfminer.layout import LAParams + + outbuf = StringIO.StringIO() + rsrcmgr = PDFResourceManager() + device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams()) + parser = PDFParser(file(filepath, "rb")) + doc = PDFDocument() + + parser.set_document(doc) + doc.set_parser(parser) + doc.initialize("") + + interpreter = PDFPageInterpreter(rsrcmgr, device) + for pagenumber, page in enumerate(doc.get_pages()): + interpreter.process_page(page) + outbuf.seek(0) + content = unicode(outbuf.read(),"utf-8",errors="replace") + yield (pagenumber+1, content) #start pages at 1 + outbuf.seek(0) + +def pdf_extract_text_pypdf(filepath): + import pyPdf + inputfile = pyPdf.PdfFileReader(file(filepath, "r")) + + pagenumber = 1 + for page in inputfile.pages: + content = page.extractText() + yield (pagenumber, content) + pagenumber += 1 + +""" Yields (pagenumber, text) """ +def pdf_extract_text(filepath): + try: + return pdf_extract_text_pdfminer(filepath) + except ImportError: + print "Fallback to pypdf" + return pdf_extract_text_pypdf(filepath) schema_book = fields.Schema( pagenumber=fields.NUMERIC(stored=True), @@ -56,11 +135,11 @@ if not create_index: #update index for deleted files deleted = 0 processed = 0 for fields in searcher_metadata.all_stored_fields(): - path = fields['path'] + path = fields["path"] processed += 1 if not os.path.exists(path): - writer_book.delete_by_term(u'path', path) - writer_metadata.delete_by_term('path', path) + writer_book.delete_by_term(u"path", path) + writer_metadata.delete_by_term("path", path) deleted += 1 print str_format(u"\r{proc} pages processed. {deleted} deleted", proc=processed, deleted=deleted), print "" @@ -73,8 +152,7 @@ searcher_metadata.close() def process_file(filepath): try: print str_format(u"{pid} processing {filepath}", pid=os.getpid(), filepath=filepath) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) - title = inputfile.getDocumentInfo().title + title = u"notimplemented" writer_metadata = index_metadata.writer() writer_metadata.add_document(title=title, path=filepath, createtime=time.time()) @@ -82,9 +160,8 @@ def process_file(filepath): writer_book = writing.BatchWriter(index_book, limit=1000) pagenumber = 1 - for page in inputfile.pages: + for pagenumber, content in pdf_extract_text(filepath): print str_format(u"{pid} processing {filepath} Page {page}", pid=os.getpid(), filepath=filepath, page=pagenumber) - content = page.extractText() writer_book.add_document(pagenumber=pagenumber, path=filepath, content=content) @@ -92,9 +169,9 @@ def process_file(filepath): writer_book.commit() except KeyboardInterrupt: - return 'KeyboardInterrupt' + return "KeyboardInterrupt" except Exception,e: - print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=e) + print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=str(e)) try: import multiprocessing as mp -- cgit v1.2.1