1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
#!/usr/bin/python2.6
# coding: utf-8
import os
from StringIO import StringIO
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import whoosh.fields as fields
import whoosh.analysis as analysis
import whoosh.highlight as highlight
import flask
import pyPdf
import werkzeug
# Flask application object; the route decorators in this module register
# their view functions on it.
app = flask.Flask("booksearch")
# Whoosh index opened from the relative directory "index", so the process
# must be started from the directory that contains it.
index = open_dir(u"index", mapped=False)
# A single searcher created at import time and reused by the search view.
# NOTE(review): presumably it will not see documents indexed after start-up
# -- confirm against the Whoosh version in use.
searcher = index.searcher()
@app.route("/")
def do_index():
    """Redirect the site root to the search view with an empty search term."""
    search_url = flask.url_for("do_search", term="")
    return flask.redirect(search_url)
@app.route("/download/file/<int:docnum>")
def do_download_file(docnum):
    """Send the complete PDF that the indexed document ``docnum`` points to.

    docnum -- document number in the index, taken from the URL.

    Returns a response that streams the file named by the document's stored
    ``path`` field as an ``application/pdf`` attachment whose client-side
    name is the basename of that path.
    """
    # Close the reader explicitly once the stored fields have been fetched
    # instead of leaving one open reader behind per request.
    reader = index.reader()
    try:
        document = reader.stored_fields(docnum)
    finally:
        reader.close()

    path = document['path']
    # A PDF is binary data: open it in binary mode so no newline translation
    # can alter the bytes on platforms where text mode does that.
    r = werkzeug.Response(open(path, "rb"), mimetype="application/pdf")
    r.headers.add('Content-Disposition', 'attachment',
                  filename=os.path.basename(path))
    return r
@app.route("/download/page/<int:docnum>", methods=["GET"])
def do_download_page(docnum):
    """Send a one-page PDF containing only the page indexed as ``docnum``.

    docnum -- document number in the index, taken from the URL.

    The stored ``path`` field names the source PDF and the stored
    ``pagenumber`` field is passed to pyPdf's ``getPage`` to select the page.
    The extracted page is built in memory and returned as an
    ``application/pdf`` attachment named ``<stem>.Page-<pagenumber><ext>``,
    where ``<stem>`` and ``<ext>`` come from the source file's basename.
    """
    # Close the reader explicitly once the stored fields have been fetched
    # instead of leaving one open reader behind per request.
    reader = index.reader()
    try:
        document = reader.stored_fields(docnum)
    finally:
        reader.close()

    path = document['path']
    pagenumber = document['pagenumber']

    outbuf = StringIO()
    # Binary mode: a PDF is not text. The ``with`` block guarantees the
    # source file is closed even if pyPdf raises.
    with open(path, 'rb') as source:
        inputfile = pyPdf.PdfFileReader(source)
        outfile = pyPdf.PdfFileWriter()
        outfile.addPage(inputfile.getPage(pagenumber))
        # Serialise while the source is still open; the page object may
        # still need to read from the underlying stream at write time.
        outfile.write(outbuf)
    outbuf.seek(0)

    r = werkzeug.Response(outbuf, mimetype="application/pdf")
    # splitext keeps the real extension and avoids the stray dot that
    # slicing three characters off the name would leave behind
    # ("book.pdf" -> "book.Page-3.pdf" rather than "book..Page-3").
    stem, extension = os.path.splitext(os.path.basename(path))
    client_filename = u"{0}.Page-{1}{2}".format(stem, pagenumber, extension)
    r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
    return r
@app.route("/search/skip=<int:skip>/<path:term>", methods=["GET"])
@app.route("/search/<path:term>", methods=["GET"])
@app.route("/search/", methods=["GET"])
def do_search(skip=0, term=None):
    """Render ``search.html`` with one page of hits for ``term``.

    skip -- number of leading hits to leave out (offset of the page).
    term -- query string, parsed against the ``content`` field.

    Without a term the template is rendered with an empty result list.
    Otherwise each entry of ``objects`` handed to the template is a dict
    with the hit's stored ``title`` and ``path``, an ``excerpt`` produced by
    the Whoosh highlighter with the HTML formatter, and the hit's ``docnum``.
    """
    page_size = 5  # hits shown per page

    # Nothing to search for: show the empty form. Checking the term alone
    # also keeps a missing term away from the query parser when the function
    # is called with a non-zero skip.
    if not term:
        return flask.render_template('search.html', objects=[], term="", skip=0)

    query = QueryParser("content").parse(term)
    # Only the first skip + page_size hits are needed to fill this page.
    results = searcher.search(query, limit=skip + page_size)
    # Terms that belong to the searched field are the ones to highlight.
    terms = [text for fieldname, text in query.all_terms()
             if fieldname == "content"]

    objects = []
    for result in results[skip:skip + page_size]:
        high = highlight.highlight(result.get("content"),
                                   terms,
                                   analysis.StandardAnalyzer(),
                                   highlight.SimpleFragmenter(),
                                   highlight.HtmlFormatter())
        objects.append({
            'title': result.get("title"),
            'path': result.get("path"),
            'excerpt': high,
            'docnum': result.docnum,
        })
    return flask.render_template('search.html', objects=objects, term=term, skip=skip)
if __name__ == "__main__":
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server run arbitrary Python. It is therefore opt-in
    # (BOOKSEARCH_DEBUG=1) and, when enabled, the server listens on the
    # loopback interface only. Without it the server stays reachable on all
    # interfaces, as before, but with the debugger switched off.
    debug = os.environ.get("BOOKSEARCH_DEBUG", "") == "1"
    app.debug = debug
    app.run(host="127.0.0.1" if debug else "0.0.0.0")
|