diff options
author | yvesf <yvesf-git@xapek.org> | 2010-11-20 01:58:53 +0100 |
---|---|---|
committer | yvesf <yvesf-git@xapek.org> | 2010-11-20 01:58:53 +0100 |
commit | 966a17b12c9deab35ef0a804d9fa1faea9c8042d (patch) | |
tree | 7ba80df4fcfaaabded6e82a48d139e48e4978952 | |
parent | 0ae1e5e802871903d73d6542252aa0a8d08fba39 (diff) | |
download | booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.tar.gz booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.zip |
page based index; download page; download file
-rw-r--r-- | index.py | 12 | ||||
-rw-r--r-- | templates/search.html | 21 | ||||
-rw-r--r-- | web.py | 46 |
3 files changed, 65 insertions, 14 deletions
```diff
diff --git a/index.py b/index.py
--- a/index.py
+++ b/index.py
@@ -13,6 +13,7 @@ from threading import Thread, Condition
 schema = fields.Schema(
     title=fields.TEXT(stored=True),
     path=fields.ID(stored=True),
+    pagenumber=fields.NUMERIC(stored=True),
     content=fields.TEXT(stored=True),
     createtime=fields.NUMERIC()
 )
@@ -48,10 +49,11 @@ class PDFWorker(Thread):
                 print u"{0} processing {1}".format(self.name, filepath)
                 inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
                 title = inputfile.getDocumentInfo().title
-                content = u""
+                pagenumber = 0
                 for page in inputfile.pages:
-                    content += page.extractText()
-                documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } )
+                    pagenumber += 1
+                    content = page.extractText()
+                    documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
             except Exception, e:
                 print u"{0} Exception: {1}".format(self.name, str(e))
             finally:
@@ -65,12 +67,12 @@ class IndexWorker(Thread):
                 doc = documents.get(True, 0.5)
             except Empty:
                 continue
-            print u"{0} adding {1}".format(self.name, doc['path'])
+            print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
             writer = index.writer()
             writer.add_document(**doc)
             writer.commit()
             documents.task_done()
-            print u"{0} added {1}".format(self.name, doc['path'])
+            print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
 
 threads = map(lambda i: PDFWorker(), range(1))
 for thread in threads:
diff --git a/templates/search.html b/templates/search.html
index e6bfe07..0c5f3d6 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -1,14 +1,33 @@
 <html>
 <head>
 <title>{{ objects.__len__() + skip}} matches</title>
+<script>
+function start() {
+var inputField = document.getElementById("search");
+window.location = "/search/" + inputField.value;
+}
+</script>
 </head>
 <body>
+<div>
+<input id="search" type="text"/>
+<button onclick="start()">Go</button>
+</div>
 {% if objects.__len__() == 0 %}
 No Matches
 {% else %}
 {% for obj in objects %}
 <hr />
-  <h2> {{ obj['title'] }} </h2>
+  <h2>
+    {{ obj['title'] }}
+    <a href="{{ url_for("do_download_page", docnum=obj['docnum']) }}">
+      This Page
+    </a>
+    -
+    <a href="{{ url_for("do_download_file", docnum=obj['docnum']) }}">
+      File
+    </a>
+  </h2>
 <pre> {{ obj['path'] }} </pre>
 {% autoescape false %}
 <div>{{ obj['excerpt'] }}</div>
diff --git a/web.py b/web.py
--- a/web.py
+++ b/web.py
@@ -7,16 +7,45 @@ import whoosh.analysis as analysis
 from whoosh import highlight
 import flask
 from flask import Flask
-
+import pyPdf
+from StringIO import StringIO
+import werkzeug
 app = Flask("booksearch")
 
 index = open_dir(u"index", mapped=False)
 searcher = index.searcher()
 
+@app.route("/")
+def do_index():
+    return flask.redirect(flask.url_for("do_search",term=""))
+
+@app.route("/download/file/<int:docnum>")
+def do_download_file(docnum):
+    document = index.reader().stored_fields(docnum)
+    filepath = document['path']
+    return werkzeug.Response(open(filepath, "r"), mimetype="application/pdf")
+
+@app.route("/download/page/<int:docnum>", methods=["GET"])
+def do_download_page(docnum):
+    document = index.reader().stored_fields(docnum)
+    filepath = document['path']
+    pagenumber = document['pagenumber']
+    inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+    page = inputfile.getPage(pagenumber)
+    outbuf = StringIO()
+    outfile = pyPdf.PdfFileWriter()
+    outfile.addPage(page)
+    outfile.write(outbuf)
+    outbuf.seek(0)
+    return werkzeug.Response(outbuf, mimetype="application/pdf")
 
 @app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
 @app.route("/search/<path:term>",methods=["GET"])
+@app.route("/search/", methods=["GET"])
 def do_search(skip=0,term=None):
+    if skip == 0 and not term:
+        return flask.render_template('search.html', objects=[], term="", skip=0)
+
     query = QueryParser("content").parse(term)
     results = searcher.search(query, limit=skip+5)
 
@@ -26,14 +55,15 @@ def do_search(skip=0,term=None):
     for result in results[skip:skip+5]:
         title = result.get("title")
         path = result.get("path")
-#        high = highlight.highlight(result.get("content"),
-#                terms,
-#                analysis.StandardAnalyzer(),
-#                highlight.SimpleFragmenter(),
-#                highlight.HtmlFormatter())
-        objects.append({ 'title' : title, 'path' : path, 'excerpt' : 'TODO' })
-    return flask.render_template('search.html', objects=objects, term=term, skip=skip)
+        print path
+        high = highlight.highlight(result.get("content"),
+                terms,
+                analysis.StandardAnalyzer(),
+                highlight.SimpleFragmenter(),
+                highlight.HtmlFormatter())
+        objects.append({ 'title' : title, 'path' : path, 'excerpt' : high, 'docnum':result.docnum })
+    return flask.render_template('search.html', objects=objects, term=term, skip=skip)
 
 if __name__ == "__main__":
     app.debug = True
```