summaryrefslogtreecommitdiff
path: root/web.py
blob: b5494c6a8c34a72e3200ee949a9de2b0f4c12cdb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/python2.6
# coding: utf-8
import os
from StringIO import StringIO
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import whoosh.fields as fields
import whoosh.analysis as analysis
import whoosh.highlight as highlight
import flask
import pyPdf
import werkzeug

# WSGI application object; the route decorators in this module register on it.
app = flask.Flask("booksearch")

# Open the existing Whoosh index stored in the directory named "index"
# (a relative path) as soon as this module is imported.
index = open_dir(u"index", mapped=False)
# One searcher is created at import time and used by do_search for every
# request.
searcher = index.searcher()

@app.route("/")
def do_index():
    """Redirect the site root to the search view with an empty term."""
    search_url = flask.url_for("do_search", term="")
    return flask.redirect(search_url)

@app.route("/download/file/<int:docnum>")
def do_download_file(docnum):
    """Send the complete PDF that indexed document `docnum` belongs to.

    The file location comes from the document's stored 'path' field. The
    response is served as an attachment named after the file's basename.
    """
    document = index.reader().stored_fields(docnum)
    # PDFs are binary data: open in "rb" so no newline translation (or
    # text-mode EOF handling) can corrupt the bytes that are streamed out.
    r = werkzeug.Response(open(document['path'], "rb"),
                          mimetype="application/pdf")
    r.headers.add('Content-Disposition', 'attachment',
        filename=os.path.basename(document['path']))
    return r

@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"])
def do_download_page(docnum,page):
    """Send a single page of an indexed PDF as its own PDF download.

    docnum -- index document number; its stored 'path' field names the
              source PDF and its stored 'pagenumber' field is used in the
              download file name.
    page   -- index of the page to extract, as passed to pyPdf's getPage().

    Responds with 404 when `page` is beyond the last page of the source PDF.
    """
    document = index.reader().stored_fields(docnum)
    outbuf = StringIO()
    # Open in binary mode and keep the source open until the writer has
    # serialized the page: pyPdf resolves page objects from the underlying
    # stream on demand. The with-block then closes the file reliably.
    with open(document['path'], 'rb') as source:
        inputfile = pyPdf.PdfFileReader(source)
        if page >= inputfile.getNumPages():
            flask.abort(404)
        outfile = pyPdf.PdfFileWriter()
        outfile.addPage(inputfile.getPage(page))
        outfile.write(outbuf)
    outbuf.seek(0)
    r = werkzeug.Response(outbuf, mimetype="application/pdf")
    # Strip the real extension (not a fixed three characters, which left a
    # dangling dot) and give the download a proper .pdf suffix.
    stem = os.path.splitext(os.path.basename(document['path']))[0]
    client_filename = stem + u".Page-{0}.pdf".format(document['pagenumber'])
    r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
    return r

class MyHtmlFormatter(highlight.HtmlFormatter):
    """HtmlFormatter that HTML-escapes the source text before highlighting."""

    def _format_fragment(self, text, fragment, seen):
        # Escape first, then hand a plain unicode string (not a Markup
        # object) to the stock formatter.
        # NOTE(review): escaping can change the length of the text; confirm
        # that the fragment's character offsets still line up afterwards.
        escaped = flask.Markup.escape(text)
        plain = unicode(escaped)
        format_with_base = highlight.HtmlFormatter._format_fragment
        return format_with_base(self, plain, fragment, seen)

@app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
@app.route("/search/<path:term>",methods=["GET"])
@app.route("/search/", methods=["GET"])
def do_search(skip=0,term=None):
    """Search the "content" field for `term` and render one page of results.

    skip -- number of file groups to skip before the ones shown.
    term -- query string, parsed with QueryParser against "content".

    Hits are requested sorted by "path" (at most 1001) and grouped per file
    path. Five groups, starting at `skip`, are passed to the 'search.html'
    template together with the term, skip and len(results). With no term
    and skip == 0 the empty search form is rendered instead.
    """
    if skip == 0 and not term:
        return flask.render_template('search.html', objects=[], term="", skip=0)

    query = QueryParser("content").parse(term)
    results = searcher.search(query, limit=1001, sortedby="path")

    terms = [text for fieldname, text in query.all_terms()
                    if fieldname == "content"]

    # Built once instead of once per hit.
    analyzer = analysis.StandardAnalyzer()
    fragmenter = highlight.SimpleFragmenter()

    # Groups are kept in a list in order of first appearance so the
    # path-sorted order of `results` survives into the paging slice below;
    # the dict is only a lookup from path to its group.
    ordered_groups = []
    groups_by_path = {}
    for result in results:
        path = result.get("path")
        # A fresh formatter per hit, as before, in case it keeps state
        # between calls.
        excerpt = highlight.highlight(result.get("content"),
                    terms,
                    analyzer,
                    fragmenter,
                    MyHtmlFormatter())
        group = groups_by_path.get(path)
        if group is None:
            group = { 'matches':[],
                      'title' : result.get("title"),
                      'first_docnum' : result.docnum,
                      'filename':os.path.basename(path) }
            groups_by_path[path] = group
            ordered_groups.append(group)
        group['matches'].append(
            { 'excerpt':excerpt,
              'docnum':result.docnum,
              'pagenumber':result.get('pagenumber') })

    objects = ordered_groups[skip:skip+5]
    return flask.render_template('search.html',
        match_groups=objects, term=term, skip=skip, resultlen=len(results))

if __name__ == "__main__":
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary Python, so it must never be combined with a public
    # bind address. Debug mode is opt-in (BOOKSEARCH_DEBUG=1) and, when
    # enabled, the server only listens on the loopback interface.
    debug = os.environ.get("BOOKSEARCH_DEBUG", "") == "1"
    app.debug = debug
    app.run(host="127.0.0.1" if debug else "0.0.0.0")