From 966a17b12c9deab35ef0a804d9fa1faea9c8042d Mon Sep 17 00:00:00 2001
From: yvesf
Date: Sat, 20 Nov 2010 01:58:53 +0100
Subject: page based index; download page; download file

---
 index.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'index.py')

diff --git a/index.py b/index.py
index 3758233..5a982a9 100644
--- a/index.py
+++ b/index.py
@@ -13,6 +13,7 @@ from threading import Thread, Condition
 schema = fields.Schema(
     title=fields.TEXT(stored=True),
     path=fields.ID(stored=True),
+    pagenumber=fields.NUMERIC(stored=True),
     content=fields.TEXT(stored=True),
     createtime=fields.NUMERIC()
 )
@@ -48,10 +49,11 @@ class PDFWorker(Thread):
                 print u"{0} processing {1}".format(self.name, filepath)
                 inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
                 title = inputfile.getDocumentInfo().title
-                content = u""
+                pagenumber = 0
                 for page in inputfile.pages:
-                    content += page.extractText()
-                documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } )
+                    pagenumber += 1
+                    content = page.extractText()
+                    documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
             except Exception, e:
                 print u"{0} Exception: {1}".format(self.name, str(e))
             finally:
@@ -65,12 +67,12 @@ class IndexWorker(Thread):
                 doc = documents.get(True, 0.5)
             except Empty:
                 continue
-            print u"{0} adding {1}".format(self.name, doc['path'])
+            print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
             writer = index.writer()
             writer.add_document(**doc)
             writer.commit()
             documents.task_done()
-            print u"{0} added {1}".format(self.name, doc['path'])
+            print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
 
 threads = map(lambda i: PDFWorker(), range(1))
 for thread in threads:
-- 
cgit v1.2.1