#!/usr/bin/python2.6 # coding: utf-8 import os import sys import pyPdf from whoosh.index import create_in, open_dir import whoosh.fields as fields import time from cStringIO import StringIO from Queue import Queue, Empty from threading import Thread, Condition schema = fields.Schema( title=fields.TEXT(stored=True), path=fields.ID(stored=True), pagenumber=fields.NUMERIC(stored=True), content=fields.TEXT(stored=True), createtime=fields.NUMERIC() ) if not os.path.exists("index"): os.mkdir("index") index = create_in(u"index", schema) else: index = open_dir("index") filepaths = Queue() documents = Queue(maxsize=5) #PDFWorker should be maximal 5 documents ahead notifier = Condition() directory = unicode(sys.argv[1], "utf8") searcher = index.searcher() print u"Walking {0}".format(directory) filecount = 0 skipped = 0 for path, directories, files in os.walk(directory): for filename in files: if filename.endswith(".pdf"): filepath = os.path.join(path, filename) docnum = searcher.document_number(path=filepath) if not docnum: skipped += 1 else: filepaths.put(filepath) filecount += 1 print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped), print "" writer = index.writer() deleted = 0 processed = 0 for fields in searcher.all_stored_fields(): path = fields['path'] processed += 1 if not os.path.exists(path): writer.delete_by_term('path', path) deleted += 1 print u"\r{0} pages processed. {1} deleted".format(processed, deleted), print "" writer.commit() searcher.close() class PDFWorker(Thread): def run(self): while True: try: filepath = filepaths.get(False) except Empty: break try: print u"{0} processing {1}".format(self.name, filepath) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) title = inputfile.getDocumentInfo().title pagenumber = 0 for page in inputfile.pages: pagenumber += 1 content = page.extractText() documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } ) except Exception, e: print u"{0} Exception: {1}".format(self.name, str(e)) finally: print u"{0} finished {1}".format(self.name, filepath) filepaths.task_done() class IndexWorker(Thread): def run(self): while index != None: try: doc = documents.get(True, 0.5) except Empty: continue print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) writer = index.writer() writer.add_document(**doc) writer.commit() documents.task_done() print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) threads = map(lambda i: PDFWorker(), range(1)) for thread in threads: thread.start() idx = IndexWorker() idx.start() print "all running" for thread in threads: thread.join() idx.join() oldindex = index index = None print "optimize index" oldindex.optimize() oldindex.close()