diff options
-rw-r--r-- | index.py | 99 | ||||
-rw-r--r-- | query.py | 57 | ||||
-rw-r--r-- | quick.py | 51 |
3 files changed, 90 insertions, 117 deletions
@@ -3,9 +3,12 @@ import os import sys import pyPdf -from whoosh.index import create_in +from whoosh.index import create_in, open_dir import whoosh.fields as fields import time +from cStringIO import StringIO +from Queue import Queue, Empty +from threading import Thread, Condition schema = fields.Schema( title=fields.TEXT(stored=True), @@ -15,24 +18,83 @@ schema = fields.Schema( if not os.path.exists("index"): os.mkdir("index") + index = create_in(u"index", schema) +else: + index = open_dir("index") -index = create_in(u"index", schema, "books") -writer = index.writer() +filepaths = Queue() +documents = Queue() +notifier = Condition() +directory = unicode(sys.argv[0], "utf8") +filecount = 0 +for path, directories, files in os.walk(directory): + for filename in files: + if filename.endswith(".pdf"): + filepaths.put(os.path.join(path, filename)) + filecount += 1 + print u"\r{0} files found".format(filecount), +print "" -# extract -directory = "/tank/share/books/isbn" +class PDFWorker(Thread): + def run(self): + while True: + try: + filepath = filepaths.get(False) + except Empty: + break + try: + print u"{0} processing {1}".format(self.name, filepath) + inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) + title = inputfile.getDocumentInfo().title + content = u"" + for page in inputfile.pages: + content += page.extractText() + documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } ) + except Exception, e: + print u"{0} Exception: {1}".format(self.name, str(e)) + finally: + filepaths.task_done() + +class IndexWorker(Thread): + def run(self): + while index != None: + try: + doc = documents.get(True, 0.5) + except Empty: + continue + writer = index.writer() + writer.add_document(**doc) + documents.task_done() + print u"Added {0}".format(doc['path']) + writer.commit() + +threads = map(lambda i: PDFWorker(), range(4)) +for thread in threads: + thread.start() + +idx = IndexWorker() +idx.start() +print "all running" -try: - for path, directories, files in os.walk(directory): - for filename in files: - if filename.endswith(".pdf"): +for thread in threads: + thread.join() + +oldindex = index +index = None +print "optimize index" +oldindex.optimize() +oldindex.close() + +""" + + try: filepath = os.path.join(path, filename) print u"Process {0}".format(filepath) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) title = inputfile.getDocumentInfo().title - content = u"" i=1 + content = "" numpages = inputfile.getNumPages() for page in inputfile.pages: sys.stdout.write("\rPage {0}/{1}".format(i, numpages)) @@ -40,15 +102,10 @@ try: content += page.extractText() i+=1 print u"" + writer = index.writer() writer.add_document(title=title, path=filepath, content=content, createtime=time.time()) -except KeyboardInterrupt: - writer.commit() - -from whoosh.qparser import QueryParser - -searcher = index.searcher() - -query = QueryParser("content").parse("world") - -results = searcher.search(query) -print results + writer.commit() + except Exception,e: + print e +""" +index.close() @@ -1,54 +1,21 @@ #!/usr/bin/python2.6 # coding: utf-8 -import os -import sys -import pyPdf -from whoosh.index import create_in -import whoosh.fields as fields -import time - -schema = fields.Schema( - title=fields.TEXT(stored=True), - path=fields.ID(stored=True), - content=fields.TEXT(stored=True), - createtime=fields.NUMERIC() ) - -if not os.path.exists("index"): - os.mkdir("index") - -index = create_in(u"index", schema, "books") -writer = index.writer() +from whoosh.index import open_dir +from whoosh.qparser import QueryParser +index = open_dir(u"index", mapped=False) -# extract -directory = "/tank/share/books/isbn" -try: - for path, directories, files in os.walk(directory): - for filename in files: - if filename.endswith(".pdf"): - filepath = os.path.join(path, filename) - print u"Process {0}".format(filepath) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) - title = inputfile.getDocumentInfo().title - content = u"" - i=1 - numpages = inputfile.getNumPages() - for page in inputfile.pages: - sys.stdout.write("\rPage {0}/{1}".format(i, numpages)) - sys.stdout.flush() - content += page.extractText() - i+=1 - print u"" - writer.add_document(title=title, path=filepath, content=content, createtime=time.time()) -except KeyboardInterrupt: - writer.commit() +searcher = index.searcher() -from whoosh.qparser import QueryParser +while True: + term = raw_input("query> ") + query = QueryParser("content").parse(term) + print query -searcher = index.searcher() + results = searcher.search(query) + for result in results: + print "Match in {0}".format(result.get("path")) -query = QueryParser("content").parse("world") + print "{0} results".format(len(results)) -results = searcher.search(query) -print results diff --git a/quick.py b/quick.py deleted file mode 100644 index 631460d..0000000 --- a/quick.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/python2.6 -# coding: utf-8 -import os -import sys -import pyPdf -from whoosh.index import create_in -import whoosh.fields as fields -import time - -schema = fields.Schema( - title=fields.TEXT(stored=True), - path=fields.ID(stored=True), - content=fields.TEXT(stored=True), - createtime=fields.NUMERIC() ) - -index = create_in("index", schema, "books") -writer = index.writer() - - -# extract -directory = u"/media/share/books/isbn" - -try: - for path, directories, files in os.walk(directory): - for filename in files: - if filename.endswith(".pdf"): - filepath = os.path.join(path, filename) - print u"Process {0}".format(filepath) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) - title = inputfile.getDocumentInfo().title - content = u"" - i=1 - numpages = inputfile.getNumPages() - for page in inputfile.pages: - sys.stdout.write("\rPage {0}/{1}".format(i, numpages)) - sys.stdout.flush() - content += page.extractText() - i+=1 - print u"" - writer.add_document(title=title, path=filepath, content=content, createtime=time.time()) -except KeyboardInterrupt: - writer.commit() - -from whoosh.qparser import QueryParser - -searcher = index.searcher() - -query = QueryParser("content").parse("world") - -results = searcher.search(query) -print results |