summary | refs | log | tree | commit | diff
path: root/index.py
diff options
context:
space:
mode:
author yvesf <yvesf-git@xapek.org> 2010-11-20 01:58:53 +0100
committer yvesf <yvesf-git@xapek.org> 2010-11-20 01:58:53 +0100
commit 966a17b12c9deab35ef0a804d9fa1faea9c8042d (patch)
tree 7ba80df4fcfaaabded6e82a48d139e48e4978952 /index.py
parent 0ae1e5e802871903d73d6542252aa0a8d08fba39 (diff)
download booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.tar.gz
booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.zip
page based index; download page; download file
Diffstat (limited to 'index.py')
-rw-r--r-- index.py 12
1 files changed, 7 insertions, 5 deletions
diff --git a/index.py b/index.py
index 3758233..5a982a9 100644
--- a/index.py
+++ b/index.py
@@ -13,6 +13,7 @@ from threading import Thread, Condition
schema = fields.Schema(
title=fields.TEXT(stored=True),
path=fields.ID(stored=True),
+ pagenumber=fields.NUMERIC(stored=True),
content=fields.TEXT(stored=True),
createtime=fields.NUMERIC() )
@@ -48,10 +49,11 @@ class PDFWorker(Thread):
print u"{0} processing {1}".format(self.name, filepath)
inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
title = inputfile.getDocumentInfo().title
- content = u""
+ pagenumber = 0
for page in inputfile.pages:
- content += page.extractText()
- documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } )
+ pagenumber += 1
+ content = page.extractText()
+ documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
except Exception, e:
print u"{0} Exception: {1}".format(self.name, str(e))
finally:
@@ -65,12 +67,12 @@ class IndexWorker(Thread):
doc = documents.get(True, 0.5)
except Empty:
continue
- print u"{0} adding {1}".format(self.name, doc['path'])
+ print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
writer = index.writer()
writer.add_document(**doc)
writer.commit()
documents.task_done()
- print u"{0} added {1}".format(self.name, doc['path'])
+ print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
threads = map(lambda i: PDFWorker(), range(1))
for thread in threads: