#!/usr/bin/python2.6
# coding: utf-8
# web.py
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import whoosh.fields as fields
import whoosh.analysis as analysis
from whoosh import highlight
import flask
from flask import Flask
import pyPdf
from StringIO import StringIO
import werkzeug
# The Flask application object; the route decorators in this file register
# their view functions on it.
app = Flask("booksearch")

# Open the Whoosh index stored in the "index" directory and create a single
# module-level searcher from it at import time.
# NOTE(review): the searcher is created once here and never reopened in this
# file -- presumably it will not see documents indexed later; confirm.
index = open_dir(u"index", mapped=False)
searcher = index.searcher()

@app.route("/")
def do_index():
    """Redirect the site root to the search view with an empty search term."""
    search_url = flask.url_for("do_search", term="")
    return flask.redirect(search_url)

@app.route("/download/file/<int:docnum>")
def do_download_file(docnum):
    """Serve the complete PDF file that an indexed document belongs to.

    docnum -- document number in the index, taken from the URL.

    Returns a werkzeug Response with mimetype "application/pdf" whose body
    is the file found at the document's stored 'path' field.
    """
    # Close the index reader as soon as the stored fields have been read
    # instead of leaving one open reader behind per request.
    reader = index.reader()
    try:
        document = reader.stored_fields(docnum)
    finally:
        reader.close()
    filepath = document['path']
    # PDFs are binary data, so open in binary mode; text mode can alter the
    # bytes on platforms that translate line endings.
    # The file object is handed to the response open so it can be streamed.
    # NOTE(review): presumably werkzeug closes it when the response is
    # closed -- confirm for the installed version.
    return werkzeug.Response(open(filepath, "rb"), mimetype="application/pdf")

@app.route("/download/page/<int:docnum>", methods=["GET"])
def do_download_page(docnum):
    """Serve a single page of an indexed PDF as its own one-page PDF.

    docnum -- document number in the index, taken from the URL.

    The document's stored 'path' field names the source PDF and its stored
    'pagenumber' field is passed to pyPdf's getPage() to select the page.
    Returns a werkzeug Response with mimetype "application/pdf".
    """
    # Close the index reader as soon as the stored fields have been read
    # instead of leaving one open reader behind per request.
    reader = index.reader()
    try:
        document = reader.stored_fields(docnum)
    finally:
        reader.close()
    filepath = document['path']
    # NOTE(review): assumes 'pagenumber' is stored as an int index that
    # getPage() accepts -- confirm against the indexing code.
    pagenumber = document['pagenumber']

    outbuf = StringIO()
    # Binary mode because PDFs are binary data; the with-block guarantees the
    # source file is closed (the original never closed it).
    with open(filepath, 'rb') as pdf_file:
        inputfile = pyPdf.PdfFileReader(pdf_file)
        outfile = pyPdf.PdfFileWriter()
        outfile.addPage(inputfile.getPage(pagenumber))
        # Write while the source is still open: pyPdf reads the page's
        # objects from the input stream on demand.
        outfile.write(outbuf)
    # Rewind so the response body starts at the beginning of the buffer.
    outbuf.seek(0)
    return werkzeug.Response(outbuf, mimetype="application/pdf")

@app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
@app.route("/search/<path:term>",methods=["GET"])
@app.route("/search/", methods=["GET"])
def do_search(skip=0,term=None):
    """Run a full-text search on the "content" field and render one page.

    skip -- number of leading hits to skip (offset of the page shown).
    term -- query string, parsed with a QueryParser for the "content" field.

    With no offset and no term the empty search form is rendered. Otherwise
    'search.html' is rendered with up to five hits, each a dict with the
    keys 'title', 'path', 'excerpt' (highlighted HTML) and 'docnum'.
    """
    if skip == 0 and not term:
        return flask.render_template('search.html', objects=[], term="", skip=0)

    # Number of hits per page; used for both the search limit and the slice.
    page_size = 5

    query = QueryParser("content").parse(term)
    # Only hits up to the end of the requested page are needed.
    results = searcher.search(query, limit=skip + page_size)

    # Terms of the query that apply to the "content" field; these are the
    # words highlighted in each excerpt.
    terms = [text for fieldname, text in query.all_terms()
             if fieldname == "content"]

    # Built once per request rather than once per hit.
    analyzer = analysis.StandardAnalyzer()
    fragmenter = highlight.SimpleFragmenter()

    objects = []
    for result in results[skip:skip + page_size]:
        # A leftover debug "print path" was removed here: it wrote to stdout
        # on every hit of every request.
        # NOTE(review): the formatter is still created per hit because it
        # may keep per-instance state in some Whoosh versions -- confirm
        # before sharing one instance.
        excerpt = highlight.highlight(result.get("content"),
            terms,
            analyzer,
            fragmenter,
            highlight.HtmlFormatter())
        objects.append({
            'title': result.get("title"),
            'path': result.get("path"),
            'excerpt': excerpt,
            'docnum': result.docnum,
        })

    return flask.render_template('search.html', objects=objects, term=term, skip=skip)

# Start Flask's built-in server when this file is executed directly.
if __name__ == "__main__":
    # NOTE(review): debug mode is switched on while the server listens on all
    # interfaces (0.0.0.0). Flask's documentation warns that the interactive
    # debugger allows arbitrary code execution, so confirm this is only ever
    # run on a trusted network.
    app.debug = True
    app.run(host="0.0.0.0")