#!/usr/bin/python2.6
# coding: utf-8
#
# Index the per-page text of all PDF files below a directory into a Whoosh
# index, together with one metadata document per file.
#
# Usage: python2.6 <this script> <directory>

import os
import sys
import time

import whoosh.index as index
import whoosh.writing as writing
import whoosh.fields as fields

from compat import str_format

import StringIO


def pdf_extract_metadata(filepath):
    """Extract the XMP metadata of a PDF file into a dict."""
    from pdfminer.pdfparser import PDFParser, PDFDocument
    import lxml.etree

    parser = PDFParser(open(filepath, "rb"))
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize("")

    namespaces = {
        "x": "adobe:ns:meta/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "pdf": "http://ns.adobe.com/pdf/1.3/",
        "xap": "http://ns.adobe.com/xap/1.0/",
        "xmpMM": "http://ns.adobe.com/xap/1.0/mm/",
    }
    metadata = {}

    def add_metadata(tree, name, xpath_expr):
        d = tree.xpath(xpath_expr, namespaces=namespaces)
        if d:
            metadata[name] = "".join(d).strip()

    if "Metadata" in doc.catalog:
        obj_stream = doc.catalog["Metadata"].resolve()
        if obj_stream.attrs["Subtype"].name == "XML":
            try:
                obj_data = obj_stream.get_data()[:-2]  # XXX remove trailing chars
                tree = lxml.etree.parse(StringIO.StringIO(obj_data))
                add_metadata(tree, "dc:title", "//rdf:Description/dc:title//*/text()")
                add_metadata(tree, "dc:creator", "//rdf:Description/dc:creator//*/text()")
                add_metadata(tree, "xap:CreateDate", "//rdf:Description/xap:CreateDate/text()")
                add_metadata(tree, "xap:CreatorTool", "//rdf:Description/xap:CreatorTool/text()")
                add_metadata(tree, "xap:ModifyDate", "//rdf:Description/xap:ModifyDate/text()")
                add_metadata(tree, "xap:MetadataDate", "//rdf:Description/xap:MetadataDate/text()")
                add_metadata(tree, "pdf:Producer", "//rdf:Description/pdf:Producer/text()")
                add_metadata(tree, "xmpMM:DocumentID", "//rdf:Description/xmpMM:DocumentID/text()")
                add_metadata(tree, "xmpMM:InstanceID", "//rdf:Description/xmpMM:InstanceID/text()")
            except lxml.etree.XMLSyntaxError, e:
                print e
    return metadata


def pdf_extract_text_pdfminer(filepath):
    """Yield (pagenumber, text) for each page, using pdfminer."""
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    outbuf = StringIO.StringIO()
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())

    parser = PDFParser(open(filepath, "rb"))
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize("")

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pagenumber, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        outbuf.seek(0)
        content = unicode(outbuf.read(), "utf-8", errors="replace")
        yield (pagenumber + 1, content)  # start page numbers at 1
        # reset the buffer so the next page does not accumulate previous text
        outbuf.seek(0)
        outbuf.truncate()


def pdf_extract_text_pypdf(filepath):
    """Yield (pagenumber, text) for each page, using pyPdf."""
    import pyPdf

    inputfile = pyPdf.PdfFileReader(open(filepath, "rb"))
    for pagenumber, page in enumerate(inputfile.pages):
        yield (pagenumber + 1, page.extractText())


def pdf_extract_text(filepath):
    """Yield (pagenumber, text) tuples, preferring pdfminer over pyPdf."""
    try:
        # the pdfminer extractor is a generator, so force the import check here
        import pdfminer
        return pdf_extract_text_pdfminer(filepath)
    except ImportError:
        print "Falling back to pyPdf"
        return pdf_extract_text_pypdf(filepath)


# One index holding the text of every PDF page, one holding per-file metadata.
schema_book = fields.Schema(
    pagenumber=fields.NUMERIC(stored=True),
    path=fields.ID(stored=True),
    content=fields.TEXT(stored=True),
)
schema_metadata = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True, unique=True),
    createtime=fields.NUMERIC(stored=True),
)

# Create the index directory on the first run, otherwise open the existing indexes.
if not os.path.exists(u"index"):
    create_index = True
    os.mkdir(u"index")
    index_book = index.create_in(u"index", schema_book, u"book")
    index_metadata = index.create_in(u"index", schema_metadata, u"metadata")
else:
    create_index = False
    index_book = index.open_dir(u"index", u"book")
    index_metadata = index.open_dir(u"index", u"metadata")

filepaths = []
directory = unicode(sys.argv[1], "utf8")

searcher_book = index_book.searcher()
searcher_metadata = index_metadata.searcher()

# Collect all PDF files below the given directory that are not indexed yet.
print str_format(u"Walking {dir}", dir=directory)
filecount = 0
skipped = 0
for path, directories, files in os.walk(directory):
    for filename in files:
        if filename.endswith(u".pdf"):
            filepath = os.path.join(path, filename)
            if create_index or not searcher_metadata.document_number(path=filepath):
                filepaths.append(filepath)
                filecount += 1
            else:
                # skip files that are already indexed
                skipped += 1
            print str_format(u"\r{count} files found, {skip} skipped",
                             count=filecount + skipped, skip=skipped),
print ""

if not create_index:
    # update the indexes for files that have been deleted from disk
    writer_book = index_book.writer()
    writer_metadata = index_metadata.writer()
    deleted = 0
    processed = 0
    for stored_fields in searcher_metadata.all_stored_fields():
        path = stored_fields["path"]
        processed += 1
        if not os.path.exists(path):
            writer_book.delete_by_term(u"path", path)
            writer_metadata.delete_by_term(u"path", path)
            deleted += 1
        print str_format(u"\r{proc} documents processed, {deleted} deleted",
                         proc=processed, deleted=deleted),
    print ""
    writer_book.commit()
    writer_metadata.commit()

searcher_book.close()
searcher_metadata.close()


def process_file(filepath):
    """Index the metadata and the per-page text of one PDF file."""
    try:
        print str_format(u"{pid} processing {filepath}",
                         pid=os.getpid(), filepath=filepath)
        title = u"notimplemented"
        writer_metadata = index_metadata.writer()
        writer_metadata.add_document(title=title, path=filepath,
                                     createtime=time.time())
        writer_metadata.commit()

        writer_book = writing.BatchWriter(index_book, limit=1000)
        for pagenumber, content in pdf_extract_text(filepath):
            print str_format(u"{pid} processing {filepath} page {page}",
                             pid=os.getpid(), filepath=filepath, page=pagenumber)
            writer_book.add_document(pagenumber=pagenumber, path=filepath,
                                     content=content)
        print str_format(u"{pid} commit", pid=os.getpid())
        writer_book.commit()
    except KeyboardInterrupt:
        return "KeyboardInterrupt"
    except Exception, e:
        print str_format(u"{pid} failed at {filepath}: {err}",
                         pid=os.getpid(), filepath=filepath, err=str(e))


try:
    # index the files in parallel, one worker process per CPU
    import multiprocessing as mp
    pool = mp.Pool()
    jobs = []
    for filepath in filepaths:
        jobs.append(pool.apply_async(process_file, (filepath,)))
    pool.close()
    pool.join()
except KeyboardInterrupt:
    pool.terminate()
except ImportError:
    # multiprocessing is not available: index the files sequentially
    for filepath in filepaths:
        if process_file(filepath) == "KeyboardInterrupt":
            break

print u"optimizing indexes"
index_metadata.optimize()
index_metadata.close()
index_book.optimize()
index_book.close()
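

# A minimal sketch, not part of the indexing run above, of how the resulting
# "book" index could be queried with Whoosh's QueryParser. The field and index
# names come from the schema created above; the function name, its parameter,
# and the result formatting are illustrative assumptions.
def example_search(querystring):
    from whoosh.qparser import QueryParser
    ix = index.open_dir(u"index", u"book")
    searcher = ix.searcher()
    try:
        query = QueryParser("content", schema=ix.schema).parse(querystring)
        # each hit stores the PDF path and page number added by process_file()
        for hit in searcher.search(query):
            print str_format(u"{path} page {page}",
                             path=hit["path"], page=hit["pagenumber"])
    finally:
        searcher.close()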