diff options
Diffstat (limited to 'index.py')
-rw-r--r-- | index.py | 12 |
1 files changed, 7 insertions, 5 deletions
@@ -13,6 +13,7 @@ from threading import Thread, Condition schema = fields.Schema( title=fields.TEXT(stored=True), path=fields.ID(stored=True), + pagenumber=fields.NUMERIC(stored=True), content=fields.TEXT(stored=True), createtime=fields.NUMERIC() ) @@ -48,10 +49,11 @@ class PDFWorker(Thread): print u"{0} processing {1}".format(self.name, filepath) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) title = inputfile.getDocumentInfo().title - content = u"" + pagenumber = 0 for page in inputfile.pages: - content += page.extractText() - documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } ) + pagenumber += 1 + content = page.extractText() + documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } ) except Exception, e: print u"{0} Exception: {1}".format(self.name, str(e)) finally: @@ -65,12 +67,12 @@ class IndexWorker(Thread): doc = documents.get(True, 0.5) except Empty: continue - print u"{0} adding {1}".format(self.name, doc['path']) + print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) writer = index.writer() writer.add_document(**doc) writer.commit() documents.task_done() - print u"{0} added {1}".format(self.name, doc['path']) + print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) threads = map(lambda i: PDFWorker(), range(1)) for thread in threads: |