diff options
author | yvesf <yvesf-git@xapek.org> | 2010-11-20 01:58:53 +0100 |
---|---|---|
committer | yvesf <yvesf-git@xapek.org> | 2010-11-20 01:58:53 +0100 |
commit | 966a17b12c9deab35ef0a804d9fa1faea9c8042d (patch) | |
tree | 7ba80df4fcfaaabded6e82a48d139e48e4978952 /index.py | |
parent | 0ae1e5e802871903d73d6542252aa0a8d08fba39 (diff) | |
download | booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.tar.gz booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.zip |
page-based index; download page; download file
Diffstat (limited to 'index.py')
-rw-r--r-- | index.py | 12 |
1 file changed, 7 insertions, 5 deletions
@@ -13,6 +13,7 @@ from threading import Thread, Condition schema = fields.Schema( title=fields.TEXT(stored=True), path=fields.ID(stored=True), + pagenumber=fields.NUMERIC(stored=True), content=fields.TEXT(stored=True), createtime=fields.NUMERIC() ) @@ -48,10 +49,11 @@ class PDFWorker(Thread): print u"{0} processing {1}".format(self.name, filepath) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) title = inputfile.getDocumentInfo().title - content = u"" + pagenumber = 0 for page in inputfile.pages: - content += page.extractText() - documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } ) + pagenumber += 1 + content = page.extractText() + documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } ) except Exception, e: print u"{0} Exception: {1}".format(self.name, str(e)) finally: @@ -65,12 +67,12 @@ class IndexWorker(Thread): doc = documents.get(True, 0.5) except Empty: continue - print u"{0} adding {1}".format(self.name, doc['path']) + print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) writer = index.writer() writer.add_document(**doc) writer.commit() documents.task_done() - print u"{0} added {1}".format(self.name, doc['path']) + print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) threads = map(lambda i: PDFWorker(), range(1)) for thread in threads: |