#!/usr/bin/python2.6
# coding: utf-8
import os
import sys
import time
from cStringIO import StringIO
from Queue import Queue, Empty
from threading import Thread, Condition

import pyPdf
import whoosh.fields as fields
from whoosh.index import create_in, exists_in, open_dir

# Layout of the documents stored in the index.
schema = fields.Schema(
    # Stored full-text field holding the document title.
    title=fields.TEXT(stored=True),
    # Stored identifier field holding the file path.
    path=fields.ID(stored=True),
    # Stored numeric field holding the page number.
    pagenumber=fields.NUMERIC(stored=True),
    # Stored full-text field holding the page content.
    content=fields.TEXT(stored=True),
    # Numeric creation timestamp; not stored.
    createtime=fields.NUMERIC(),
)

# Open the index in ./index, creating the directory and/or the index when
# they are missing.  The decision to create is based on whether the
# directory actually contains an index, not merely on whether the directory
# exists: a directory left behind without an index (e.g. by an interrupted
# first run) would otherwise make open_dir() fail on every later run.
INDEX_DIR = u"index"
if not os.path.exists(INDEX_DIR):
    os.mkdir(INDEX_DIR)
if exists_in(INDEX_DIR):
    index = open_dir(INDEX_DIR)
else:
    index = create_in(INDEX_DIR, schema)

# Shared state used by the worker threads.
# `documents` is bounded so that PDFWorker can be at most 5 queued
# documents ahead of the consumer; `filepaths` is unbounded.
documents = Queue(maxsize=5)
filepaths = Queue()
notifier = Condition()

directory = unicode(sys.argv[1], "utf8")
searcher = index.searcher()
print u"Walking {0}".format(directory)
filecount = 0
skipped = 0
for path, directories, files in os.walk(directory):
    for filename in files:
        if filename.endswith(".pdf"):
            filepath = os.path.join(path, filename)
            docnum = searcher.document_number(path=filepath)
            if not docnum:
                skipped += 1
            else:
                filepaths.put(filepath)
                filecount += 1
            print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
print ""

writer = index.writer()
deleted = 0
processed = 0
for fields in searcher.all_stored_fields():
    path = fields['path']
    processed += 1
    if not os.path.exists(path):
        writer.delete_by_term('path', path)
        deleted += 1
    print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
print ""

writer.commit()
searcher.close()

class PDFWorker(Thread):
    def run(self):
        while True:
            try:
                filepath = filepaths.get(False)
            except Empty:
                break
            try:
                print u"{0} processing {1}".format(self.name, filepath)
                inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
                title = inputfile.getDocumentInfo().title
                pagenumber = 0
                for page in inputfile.pages:
                    pagenumber += 1
                    content = page.extractText()
                    documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
            except Exception, e:
                print u"{0} Exception: {1}".format(self.name, str(e))
            finally:
                print u"{0} finished   {1}".format(self.name, filepath)
                filepaths.task_done()

class IndexWorker(Thread):
    def run(self):
        while index != None:
            try:
                doc = documents.get(True, 0.5)
            except Empty:
                continue
            print u"{0} adding     {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
            writer = index.writer()
            writer.add_document(**doc)
            writer.commit()
            documents.task_done()
            print u"{0} added      {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])

threads = map(lambda i: PDFWorker(), range(1))
for thread in threads:
    thread.start()

idx = IndexWorker()
idx.start()
print "all running" 

for thread in threads:
    thread.join()

idx.join()

oldindex = index
index = None
print "optimize index"
oldindex.optimize()
oldindex.close()