From bec8150c93a640ac4b6d1cebc86bd721dfdd6b59 Mon Sep 17 00:00:00 2001 From: yvesf Date: Sat, 20 Nov 2010 02:12:50 +0100 Subject: update INSTALL help; rename index->indexer; prettify web.py --- INSTALL | 33 ++++++++++++++++++ LOG | 7 ---- index.py | 115 ------------------------------------------------------------- indexer.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ web.py | 8 ++--- 5 files changed, 152 insertions(+), 126 deletions(-) create mode 100644 INSTALL delete mode 100644 LOG delete mode 100644 index.py create mode 100644 indexer.py diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..39d4774 --- /dev/null +++ b/INSTALL @@ -0,0 +1,33 @@ + +Create a virtualenv + virtualenv create booksearch_env + cd booksearch_env + # activate this virtualenv + . ./bin/activate + +Install dependencies + easy_install pip + pip install whoosh + pip install pypdf + pip install flask + +( Clone ) + git clone http://xapek.org/~yvesf/repos/booksearch.git + cd booksearch + +Create index + python indexer.py ~/my_books + +Test index + python query.py + query> test + Term('content', 'test') + Match in /home/XXXXX6-4.pdf + Match in /home/XXXXX6-4.pdf + 2 results + query> + +Run Webapp + python web.py + * Running on http://0.0.0.0:5000/ + * Restarting with reloader... diff --git a/LOG b/LOG deleted file mode 100644 index e850077..0000000 --- a/LOG +++ /dev/null @@ -1,7 +0,0 @@ -virtualenv create booksearch_env -cd booksearch_env -. ./bin/activate -easy_install pip -pip install whoosh -pip install pypdf - diff --git a/index.py b/index.py deleted file mode 100644 index 5a982a9..0000000 --- a/index.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/python2.6 -# coding: utf-8 -import os -import sys -import pyPdf -from whoosh.index import create_in, open_dir -import whoosh.fields as fields -import time -from cStringIO import StringIO -from Queue import Queue, Empty -from threading import Thread, Condition - -schema = fields.Schema( - title=fields.TEXT(stored=True), - path=fields.ID(stored=True), - pagenumber=fields.NUMERIC(stored=True), - content=fields.TEXT(stored=True), - createtime=fields.NUMERIC() ) - -if not os.path.exists("index"): - os.mkdir("index") - index = create_in(u"index", schema) -else: - index = open_dir("index") - -filepaths = Queue() -documents = Queue() -notifier = Condition() - -directory = unicode(sys.argv[1], "utf8") -print u"Walking {0}".format(directory) -filecount = 0 -for path, directories, files in os.walk(directory): - for filename in files: - if filename.endswith(".pdf"): - filepaths.put(os.path.join(path, filename)) - filecount += 1 - print u"\r{0} files found".format(filecount), -print "" - -class PDFWorker(Thread): - def run(self): - while True: - try: - filepath = filepaths.get(False) - except Empty: - break - try: - print u"{0} processing {1}".format(self.name, filepath) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) - title = inputfile.getDocumentInfo().title - pagenumber = 0 - for page in inputfile.pages: - pagenumber += 1 - content = page.extractText() - documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } ) - except Exception, e: - print u"{0} Exception: {1}".format(self.name, str(e)) - finally: - print u"{0} finished {1}".format(self.name, filepath) - filepaths.task_done() - -class IndexWorker(Thread): - def run(self): - while index != None: - try: - doc = documents.get(True, 0.5) - except Empty: - continue - print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) - writer = index.writer() - writer.add_document(**doc) - writer.commit() - documents.task_done() - print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) - -threads = map(lambda i: PDFWorker(), range(1)) -for thread in threads: - thread.start() - -idx = IndexWorker() -idx.start() -print "all running" - -for thread in threads: - thread.join() - -oldindex = index -index = None -print "optimize index" -oldindex.optimize() -oldindex.close() - -""" - - try: - filepath = os.path.join(path, filename) - print u"Process {0}".format(filepath) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) - title = inputfile.getDocumentInfo().title - i=1 - content = "" - numpages = inputfile.getNumPages() - for page in inputfile.pages: - sys.stdout.write("\rPage {0}/{1}".format(i, numpages)) - sys.stdout.flush() - content += page.extractText() - i+=1 - print u"" - writer = index.writer() - writer.add_document(title=title, path=filepath, content=content, createtime=time.time()) - writer.commit() - except Exception,e: - print e -""" diff --git a/indexer.py b/indexer.py new file mode 100644 index 0000000..5a982a9 --- /dev/null +++ b/indexer.py @@ -0,0 +1,115 @@ +#!/usr/bin/python2.6 +# coding: utf-8 +import os +import sys +import pyPdf +from whoosh.index import create_in, open_dir +import whoosh.fields as fields +import time +from cStringIO import StringIO +from Queue import Queue, Empty +from threading import Thread, Condition + +schema = fields.Schema( + title=fields.TEXT(stored=True), + path=fields.ID(stored=True), + pagenumber=fields.NUMERIC(stored=True), + content=fields.TEXT(stored=True), + createtime=fields.NUMERIC() ) + +if not os.path.exists("index"): + os.mkdir("index") + index = create_in(u"index", schema) +else: + index = open_dir("index") + +filepaths = Queue() +documents = Queue() +notifier = Condition() + +directory = unicode(sys.argv[1], "utf8") +print u"Walking {0}".format(directory) +filecount = 0 +for path, directories, files in os.walk(directory): + for filename in files: + if filename.endswith(".pdf"): + filepaths.put(os.path.join(path, filename)) + filecount += 1 + print u"\r{0} files found".format(filecount), +print "" + +class PDFWorker(Thread): + def run(self): + while True: + try: + filepath = filepaths.get(False) + except Empty: + break + try: + print u"{0} processing {1}".format(self.name, filepath) + inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) + title = inputfile.getDocumentInfo().title + pagenumber = 0 + for page in inputfile.pages: + pagenumber += 1 + content = page.extractText() + documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } ) + except Exception, e: + print u"{0} Exception: {1}".format(self.name, str(e)) + finally: + print u"{0} finished {1}".format(self.name, filepath) + filepaths.task_done() + +class IndexWorker(Thread): + def run(self): + while index != None: + try: + doc = documents.get(True, 0.5) + except Empty: + continue + print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) + writer = index.writer() + writer.add_document(**doc) + writer.commit() + documents.task_done() + print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) + +threads = map(lambda i: PDFWorker(), range(1)) +for thread in threads: + thread.start() + +idx = IndexWorker() +idx.start() +print "all running" + +for thread in threads: + thread.join() + +oldindex = index +index = None +print "optimize index" +oldindex.optimize() +oldindex.close() + +""" + + try: + filepath = os.path.join(path, filename) + print u"Process {0}".format(filepath) + inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) + title = inputfile.getDocumentInfo().title + i=1 + content = "" + numpages = inputfile.getNumPages() + for page in inputfile.pages: + sys.stdout.write("\rPage {0}/{1}".format(i, numpages)) + sys.stdout.flush() + content += page.extractText() + i+=1 + print u"" + writer = index.writer() + writer.add_document(title=title, path=filepath, content=content, createtime=time.time()) + writer.commit() + except Exception,e: + print e +""" diff --git a/web.py b/web.py index b747010..babc9dd 100644 --- a/web.py +++ b/web.py @@ -1,16 +1,16 @@ #!/usr/bin/python2.6 # coding: utf-8 +from StringIO import StringIO from whoosh.index import open_dir from whoosh.qparser import QueryParser import whoosh.fields as fields import whoosh.analysis as analysis -from whoosh import highlight +import whoosh.highlight as highlight import flask -from flask import Flask import pyPdf -from StringIO import StringIO import werkzeug -app = Flask("booksearch") + +app = flask.Flask("booksearch") index = open_dir(u"index", mapped=False) searcher = index.searcher() -- cgit v1.2.1