#!/usr/bin/python2.6 # coding: utf-8 import os import sys import pyPdf from whoosh.index import create_in, open_dir import whoosh.fields as fields import time from cStringIO import StringIO from Queue import Queue, Empty from threading import Thread, Condition schema = fields.Schema( title=fields.TEXT(stored=True), path=fields.ID(stored=True), content=fields.TEXT(stored=True), createtime=fields.NUMERIC() ) if not os.path.exists("index"): os.mkdir("index") index = create_in(u"index", schema) else: index = open_dir("index") filepaths = Queue() documents = Queue() notifier = Condition() directory = unicode(sys.argv[1], "utf8") print u"Walking {0}".format(directory) filecount = 0 for path, directories, files in os.walk(directory): for filename in files: if filename.endswith(".pdf"): filepaths.put(os.path.join(path, filename)) filecount += 1 print u"\r{0} files found".format(filecount), print "" class PDFWorker(Thread): def run(self): while True: try: filepath = filepaths.get(False) except Empty: break try: print u"{0} processing {1}".format(self.name, filepath) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) title = inputfile.getDocumentInfo().title content = u"" for page in inputfile.pages: content += page.extractText() documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } ) except Exception, e: print u"{0} Exception: {1}".format(self.name, str(e)) finally: print u"{0} finished {1}".format(self.name, filepath) filepaths.task_done() class IndexWorker(Thread): def run(self): while index != None: try: doc = documents.get(True, 0.5) except Empty: continue writer = index.writer() writer.add_document(**doc) documents.task_done() print u"{0} added {1}".format(self.name, doc['path']) writer.commit() threads = map(lambda i: PDFWorker(), range(1)) for thread in threads: thread.start() idx = IndexWorker() idx.start() print "all running" for thread in threads: thread.join() oldindex = index index = None print "optimize index" oldindex.optimize() oldindex.close() """ try: filepath = os.path.join(path, filename) print u"Process {0}".format(filepath) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) title = inputfile.getDocumentInfo().title i=1 content = "" numpages = inputfile.getNumPages() for page in inputfile.pages: sys.stdout.write("\rPage {0}/{1}".format(i, numpages)) sys.stdout.flush() content += page.extractText() i+=1 print u"" writer = index.writer() writer.add_document(title=title, path=filepath, content=content, createtime=time.time()) writer.commit() except Exception,e: print e """