From 5b7f0bdf98e4fffca943e408a60f2fe2e289fef6 Mon Sep 17 00:00:00 2001 From: yvesf Date: Mon, 22 Nov 2010 00:03:26 +0100 Subject: save progress --- INSTALL | 6 +++++- indexer.py | 43 ++++++++++++++++++-------------------- templates/search.html | 17 ++++++++++++++- web.py | 57 ++++++++++++++++++++++++++++++--------------------- 4 files changed, 75 insertions(+), 48 deletions(-) diff --git a/INSTALL b/INSTALL index 39d4774..fe10593 100644 --- a/INSTALL +++ b/INSTALL @@ -1,4 +1,3 @@ - Create a virtualenv virtualenv create booksearch_env cd booksearch_env @@ -31,3 +30,8 @@ Run Webapp python web.py * Running on http://0.0.0.0:5000/ * Restarting with reloader... + + + +Check for + http://bitbucket.org/mchaput/whoosh/issue/48/temp-directories-are-not-deleted-when diff --git a/indexer.py b/indexer.py index 1508bd4..8ab4cdf 100644 --- a/indexer.py +++ b/indexer.py @@ -2,17 +2,16 @@ # coding: utf-8 import os import sys +import time import pyPdf -from whoosh.index import create_in, open_dir +import whoosh.index as index +import whoosh.writing as writing import whoosh.fields as fields -import time -from cStringIO import StringIO -from Queue import Queue, Empty import multiprocessing as mp schema_book = fields.Schema( pagenumber=fields.NUMERIC(stored=True), - metadata_docnum=fields.NUMERIC(stored=True), + path=fields.ID(stored=True), content=fields.TEXT(stored=True), ) @@ -24,13 +23,12 @@ schema_metadata = fields.Schema( if not os.path.exists(u"index"): create_index = True os.mkdir(u"index") - index_book = create_in(u"index", schema_book, u"book") - index_metadata = create_in(u"index", schema_metadata, u"metadata") + index_book = index.create_in(u"index", schema_book, u"book") + index_metadata = index.create_in(u"index", schema_metadata, u"metadata") else: create_index = False - index_book = open_dir(u"index", u"book") - index_metadata = open_dir(u"index", u"metadata") - + index_book = index.open_dir(u"index", u"book") + index_metadata = index.open_dir(u"index", u"metadata") filepaths = [] directory = unicode(sys.argv[1], "utf8") @@ -43,12 +41,12 @@ for path, directories, files in os.walk(directory): for filename in files: if filename.endswith(u".pdf"): filepath = os.path.join(path, filename) - docnum = create_index or searcher_metadata.document_number(path=filepath) - if not docnum: - skipped += 1 - else: + if create_index or not searcher_metadata.document_number(path=filepath): filepaths.append(filepath) filecount += 1 + else: + #skip files that are already indexed + skipped += 1 print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped), print "" @@ -61,7 +59,7 @@ if not create_index: #update index for deleted files path = fields['path'] processed += 1 if not os.path.exists(path): - writer_book.delete_by_term(u'metadata_docnum', searcher_metadata.document_number(path=path)) + writer_book.delete_by_term(u'path', path) writer_metadata.delete_by_term('path', path) deleted += 1 print u"\r{0} pages processed. {1} deleted".format(processed, deleted), @@ -81,20 +79,18 @@ def process_file(filepath): writer_metadata = index_metadata.writer() writer_metadata.add_document(title=title, path=filepath, createtime=time.time()) writer_metadata.commit() - searcher_metadata = index_metadata.searcher() - metadata_docnum = searcher_metadata.document_number(path=filepath) - searcher_metadata.close() + writer_book = writing.BatchWriter(index_book, limit=1000) pagenumber = 1 for page in inputfile.pages: print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber) content = page.extractText() - writer_book = index_book.writer() writer_book.add_document(pagenumber=pagenumber, - metadata_docnum=metadata_docnum, - content=content) - writer_book.commit() + path=filepath, + content=content) pagenumber += 1 + + writer_book.commit() except KeyboardInterrupt: return 'KeyboardInterrupt' except Exception,e: @@ -111,8 +107,9 @@ except KeyboardInterrupt: pool.terminate() except ImportError: for filepath in filepaths: - #if process_file(filepath) == "KeyboardInterrupt": + if process_file(filepath) == "KeyboardInterrupt": break + print u"optimize indexes" index_metadata.optimize() index_metadata.close() diff --git a/templates/search.html b/templates/search.html index 8e0a206..4afe479 100644 --- a/templates/search.html +++ b/templates/search.html @@ -8,5 +8,20 @@ {% endblock %} {% block searchValue %}{{ term }}{% endblock %} {% block content %} -content + {% if matches %} + {# Result rendering #} + Matched {{ matches.__len__() }} Book{% if matches.__len__() > 1 %}s{% endif %} + {% for docnum, matches in matches.items() %} +
+ book: {{ docnum }} + + {% for match in matches %} +
+ {{ match }} +
+ {% endfor %} +
+ match in {{ docnum }} + {% endfor %} + {% endif %} {% endblock %} diff --git a/web.py b/web.py index dc1632e..077aa22 100644 --- a/web.py +++ b/web.py @@ -32,26 +32,28 @@ def do_download_file(docnum): filename=os.path.basename(document['path'])) return r -@app.route("/test") -def test(): - input = pyPdf.PdfFileReader(file("test.pdf", 'r')) - page = input.getPage(0) +@app.route("/download/image/", methods=["GET"]) +@app.route("/download/image//", methods=["GET"]) +def do_download_image(docnum,page=0): + def generator(process, input): + input.seek(0) + while not process.stdin.closed: + stdout, stderr = process.communicate(input.read()) + if stdout: + yield stdout + + with index_metadata.reader() as reader: + document = reader.stored_fields(docnum) + filepath = document['path'] + pdffile = StringIO() + + page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page) out = pyPdf.PdfFileWriter() out.addPage(page) - pdffile = StringIO() out.write(pdffile) - process = subprocess.Popen(["/usr/bin/convert", "pdf:-", "jpeg:-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) - outjpg = StringIO() - pdffile.seek(0) - while not process.stdin.closed: - print "." - stdout, stderr = process.communicate(pdffile.read()) - if not stdout: - break - outjpg.write(stdout) - outjpg.seek(0) - r = werkzeug.Response(outjpg, mimetype="image/jpeg") - return r + process = subprocess.Popen(["/usr/bin/convert", "pdf:-", "jpeg:-"], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + return werkzeug.Response(generator(process,pdffile), mimetype="image/jpeg") @app.route("/download/page//", methods=["GET"]) def do_download_page(docnum,page): @@ -104,14 +106,23 @@ def do_search(skip=0,term=None): if skip == 0 and not term: return flask.render_template('search.html', objects=[], term="", skip=0) + searcher = index_book.searcher() query = QueryParser("content").parse(term) facets = searching.Facets.from_field(searcher, "path") - results = searcher.search(query, limit=None, sortedby="path") - categories = {} - for key, value in facets.categorize(results).items(): - categories[key] = map(lambda v: v[0], value) - - return flask.jsonify(categories) + results = searcher.search(query, limit=None) + categories = facets.categorize(results).items() + searcher.close() + + matches = {} + for book in categories: + filepath = book[0] + with index_metadata.searcher() as searcher: + docnum = searcher.document_number(path=filepath) + matches[docnum] = [] + for match in book[1]: + matches[docnum].append(match) + + return flask.render_template('search.html', matches=matches, term=term) """ terms = [text for fieldname, text in query.all_terms() if fieldname == "content"] -- cgit v1.2.1