summary refs log tree commit diff
path: root/index.py
diff options
context:
space:
mode:
Diffstat (limited to 'index.py')
-rw-r--r-- index.py 99
1 file changed, 78 insertions, 21 deletions
diff --git a/index.py b/index.py
index f7f70f3..1161165 100644
--- a/index.py
+++ b/index.py
@@ -3,9 +3,12 @@
import os
import sys
import pyPdf
-from whoosh.index import create_in
+from whoosh.index import create_in, open_dir
import whoosh.fields as fields
import time
+from cStringIO import StringIO
+from Queue import Queue, Empty
+from threading import Thread, Condition
schema = fields.Schema(
title=fields.TEXT(stored=True),
@@ -15,24 +18,83 @@ schema = fields.Schema(
if not os.path.exists("index"):
os.mkdir("index")
+ index = create_in(u"index", schema)
+else:
+ index = open_dir("index")
-index = create_in(u"index", schema, "books")
-writer = index.writer()
+filepaths = Queue()
+documents = Queue()
+notifier = Condition()
+directory = unicode(sys.argv[0], "utf8")
+filecount = 0
+for path, directories, files in os.walk(directory):
+ for filename in files:
+ if filename.endswith(".pdf"):
+ filepaths.put(os.path.join(path, filename))
+ filecount += 1
+ print u"\r{0} files found".format(filecount),
+print ""
-# extract
-directory = "/tank/share/books/isbn"
+class PDFWorker(Thread):
+ def run(self):
+ while True:
+ try:
+ filepath = filepaths.get(False)
+ except Empty:
+ break
+ try:
+ print u"{0} processing {1}".format(self.name, filepath)
+ inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+ title = inputfile.getDocumentInfo().title
+ content = u""
+ for page in inputfile.pages:
+ content += page.extractText()
+ documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } )
+ except Exception, e:
+ print u"{0} Exception: {1}".format(self.name, str(e))
+ finally:
+ filepaths.task_done()
+
+class IndexWorker(Thread):
+ def run(self):
+ while index != None:
+ try:
+ doc = documents.get(True, 0.5)
+ except Empty:
+ continue
+ writer = index.writer()
+ writer.add_document(**doc)
+ documents.task_done()
+ print u"Added {0}".format(doc['path'])
+ writer.commit()
+
+threads = map(lambda i: PDFWorker(), range(4))
+for thread in threads:
+ thread.start()
+
+idx = IndexWorker()
+idx.start()
+print "all running"
-try:
- for path, directories, files in os.walk(directory):
- for filename in files:
- if filename.endswith(".pdf"):
+for thread in threads:
+ thread.join()
+
+oldindex = index
+index = None
+print "optimize index"
+oldindex.optimize()
+oldindex.close()
+
+"""
+
+ try:
filepath = os.path.join(path, filename)
print u"Process {0}".format(filepath)
inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
title = inputfile.getDocumentInfo().title
- content = u""
i=1
+ content = ""
numpages = inputfile.getNumPages()
for page in inputfile.pages:
sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
@@ -40,15 +102,10 @@ try:
content += page.extractText()
i+=1
print u""
+ writer = index.writer()
writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
-except KeyboardInterrupt:
- writer.commit()
-
-from whoosh.qparser import QueryParser
-
-searcher = index.searcher()
-
-query = QueryParser("content").parse("world")
-
-results = searcher.search(query)
-print results
+ writer.commit()
+ except Exception,e:
+ print e
+"""
+index.close()