summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  INSTALL                |  6
-rw-r--r--  indexer.py             | 43
-rw-r--r--  templates/search.html  | 17
-rw-r--r--  web.py                 | 57
4 files changed, 75 insertions(+), 48 deletions(-)
diff --git a/INSTALL b/INSTALL
index 39d4774..fe10593 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,4 +1,3 @@
-
Create a virtualenv
virtualenv create booksearch_env
cd booksearch_env
@@ -31,3 +30,8 @@ Run Webapp
python web.py
* Running on http://0.0.0.0:5000/
* Restarting with reloader...
+
+
+
+Check for
+ http://bitbucket.org/mchaput/whoosh/issue/48/temp-directories-are-not-deleted-when
diff --git a/indexer.py b/indexer.py
index 1508bd4..8ab4cdf 100644
--- a/indexer.py
+++ b/indexer.py
@@ -2,17 +2,16 @@
# coding: utf-8
import os
import sys
+import time
import pyPdf
-from whoosh.index import create_in, open_dir
+import whoosh.index as index
+import whoosh.writing as writing
import whoosh.fields as fields
-import time
-from cStringIO import StringIO
-from Queue import Queue, Empty
import multiprocessing as mp
schema_book = fields.Schema(
pagenumber=fields.NUMERIC(stored=True),
- metadata_docnum=fields.NUMERIC(stored=True),
+ path=fields.ID(stored=True),
content=fields.TEXT(stored=True),
)
@@ -24,13 +23,12 @@ schema_metadata = fields.Schema(
if not os.path.exists(u"index"):
create_index = True
os.mkdir(u"index")
- index_book = create_in(u"index", schema_book, u"book")
- index_metadata = create_in(u"index", schema_metadata, u"metadata")
+ index_book = index.create_in(u"index", schema_book, u"book")
+ index_metadata = index.create_in(u"index", schema_metadata, u"metadata")
else:
create_index = False
- index_book = open_dir(u"index", u"book")
- index_metadata = open_dir(u"index", u"metadata")
-
+ index_book = index.open_dir(u"index", u"book")
+ index_metadata = index.open_dir(u"index", u"metadata")
filepaths = []
directory = unicode(sys.argv[1], "utf8")
@@ -43,12 +41,12 @@ for path, directories, files in os.walk(directory):
for filename in files:
if filename.endswith(u".pdf"):
filepath = os.path.join(path, filename)
- docnum = create_index or searcher_metadata.document_number(path=filepath)
- if not docnum:
- skipped += 1
- else:
+ if create_index or not searcher_metadata.document_number(path=filepath):
filepaths.append(filepath)
filecount += 1
+ else:
+ #skip files that are already indexed
+ skipped += 1
print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
print ""
@@ -61,7 +59,7 @@ if not create_index: #update index for deleted files
path = fields['path']
processed += 1
if not os.path.exists(path):
- writer_book.delete_by_term(u'metadata_docnum', searcher_metadata.document_number(path=path))
+ writer_book.delete_by_term(u'path', path)
writer_metadata.delete_by_term('path', path)
deleted += 1
print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
@@ -81,20 +79,18 @@ def process_file(filepath):
writer_metadata = index_metadata.writer()
writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
writer_metadata.commit()
- searcher_metadata = index_metadata.searcher()
- metadata_docnum = searcher_metadata.document_number(path=filepath)
- searcher_metadata.close()
+ writer_book = writing.BatchWriter(index_book, limit=1000)
pagenumber = 1
for page in inputfile.pages:
print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber)
content = page.extractText()
- writer_book = index_book.writer()
writer_book.add_document(pagenumber=pagenumber,
- metadata_docnum=metadata_docnum,
- content=content)
- writer_book.commit()
+ path=filepath,
+ content=content)
pagenumber += 1
+
+ writer_book.commit()
except KeyboardInterrupt:
return 'KeyboardInterrupt'
except Exception,e:
@@ -111,8 +107,9 @@ except KeyboardInterrupt:
pool.terminate()
except ImportError:
for filepath in filepaths:
- #if process_file(filepath) == "KeyboardInterrupt":
+ if process_file(filepath) == "KeyboardInterrupt":
break
+
print u"optimize indexes"
index_metadata.optimize()
index_metadata.close()
diff --git a/templates/search.html b/templates/search.html
index 8e0a206..4afe479 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -8,5 +8,20 @@
{% endblock %}
{% block searchValue %}{{ term }}{% endblock %}
{% block content %}
-content
+ {% if matches %}
+ {# Result rendering #}
+ Matched {{ matches.__len__() }} Book{% if matches.__len__() > 1 %}s{% endif %}
+ {% for docnum, matches in matches.items() %}
+ <div class="book">
+ book: {{ docnum }}
+ <img src="{{ url_for("do_download_image", docnum=docnum, page=0) }}"/>
+ {% for match in matches %}
+ <div class="match">
+ {{ match }}
+ </div>
+ {% endfor %}
+ </div>
+ match in {{ docnum }}
+ {% endfor %}
+ {% endif %}
{% endblock %}
diff --git a/web.py b/web.py
index dc1632e..077aa22 100644
--- a/web.py
+++ b/web.py
@@ -32,26 +32,28 @@ def do_download_file(docnum):
filename=os.path.basename(document['path']))
return r
-@app.route("/test")
-def test():
- input = pyPdf.PdfFileReader(file("test.pdf", 'r'))
- page = input.getPage(0)
+@app.route("/download/image/<int:docnum>", methods=["GET"])
+@app.route("/download/image/<int:docnum>/<int:page>", methods=["GET"])
+def do_download_image(docnum,page=0):
+ def generator(process, input):
+ input.seek(0)
+ while not process.stdin.closed:
+ stdout, stderr = process.communicate(input.read())
+ if stdout:
+ yield stdout
+
+ with index_metadata.reader() as reader:
+ document = reader.stored_fields(docnum)
+ filepath = document['path']
+ pdffile = StringIO()
+
+ page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page)
out = pyPdf.PdfFileWriter()
out.addPage(page)
- pdffile = StringIO()
out.write(pdffile)
- process = subprocess.Popen(["/usr/bin/convert", "pdf:-", "jpeg:-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
- outjpg = StringIO()
- pdffile.seek(0)
- while not process.stdin.closed:
- print "."
- stdout, stderr = process.communicate(pdffile.read())
- if not stdout:
- break
- outjpg.write(stdout)
- outjpg.seek(0)
- r = werkzeug.Response(outjpg, mimetype="image/jpeg")
- return r
+ process = subprocess.Popen(["/usr/bin/convert", "pdf:-", "jpeg:-"],
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ return werkzeug.Response(generator(process,pdffile), mimetype="image/jpeg")
@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"])
def do_download_page(docnum,page):
@@ -104,14 +106,23 @@ def do_search(skip=0,term=None):
if skip == 0 and not term:
return flask.render_template('search.html', objects=[], term="", skip=0)
+ searcher = index_book.searcher()
query = QueryParser("content").parse(term)
facets = searching.Facets.from_field(searcher, "path")
- results = searcher.search(query, limit=None, sortedby="path")
- categories = {}
- for key, value in facets.categorize(results).items():
- categories[key] = map(lambda v: v[0], value)
-
- return flask.jsonify(categories)
+ results = searcher.search(query, limit=None)
+ categories = facets.categorize(results).items()
+ searcher.close()
+
+ matches = {}
+ for book in categories:
+ filepath = book[0]
+ with index_metadata.searcher() as searcher:
+ docnum = searcher.document_number(path=filepath)
+ matches[docnum] = []
+ for match in book[1]:
+ matches[docnum].append(match)
+
+ return flask.render_template('search.html', matches=matches, term=term)
"""
terms = [text for fieldname, text in query.all_terms()
if fieldname == "content"]