1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
#!/usr/bin/python2.6
# coding: utf-8
import os
from StringIO import StringIO
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import whoosh.fields as fields
import whoosh.analysis as analysis
import whoosh.highlight as highlight
import flask
import pyPdf
import werkzeug
# Flask application object; the route decorators below register on it.
app = flask.Flask("booksearch")
# Open the existing whoosh index stored in the "index" directory.
# NOTE(review): mapped=False presumably disables memory-mapped file access
# -- confirm against the installed whoosh version.
index = open_dir(u"index", mapped=False)
# One searcher created at import time and used by do_search for every request.
# NOTE(review): a searcher opened here presumably does not see documents
# indexed after start-up -- confirm whether a refresh is needed.
searcher = index.searcher()
@app.route("/")
def do_index():
    """Redirect the site root to the search view with an empty term."""
    search_url = flask.url_for("do_search", term="")
    return flask.redirect(search_url)
@app.route("/download/file/<int:docnum>")
def do_download_file(docnum):
    """Send the complete PDF behind an indexed document as a download.

    docnum -- whoosh document number taken from the URL.

    Returns a werkzeug response with mimetype application/pdf whose
    Content-Disposition header carries the base name of the stored path.
    """
    # Close the per-request reader as soon as the stored fields have been
    # read; previously every download left an index reader open.
    reader = index.reader()
    try:
        document = reader.stored_fields(docnum)
    finally:
        reader.close()
    path = document['path']
    # PDFs are binary data: text mode ("r") can corrupt them on platforms
    # that translate line endings, so open the file in binary mode.
    r = werkzeug.Response(open(path, "rb"), mimetype="application/pdf")
    r.headers.add('Content-Disposition', 'attachment',
                  filename=os.path.basename(path))
    return r
@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"])
def do_download_page(docnum, page):
    """Send a single page of an indexed PDF as a one-page PDF download.

    docnum -- whoosh document number taken from the URL.
    page   -- index of the page handed to pyPdf's getPage().

    Responds with 404 when the page index lies beyond the end of the PDF.
    The download is named "<original name>.Page-<pagenumber>.pdf", where
    pagenumber is the value stored in the index for this document.
    """
    # Close the per-request reader as soon as the stored fields have been
    # read; previously every download left an index reader open.
    reader = index.reader()
    try:
        document = reader.stored_fields(docnum)
    finally:
        reader.close()
    path = document['path']

    outbuf = StringIO()
    # Binary mode, and keep the source open until the writer has finished:
    # the page is still read from this file while it is being written out.
    with open(path, 'rb') as pdf_file:
        inputfile = pyPdf.PdfFileReader(pdf_file)
        if page >= inputfile.getNumPages():
            flask.abort(404)
        outfile = pyPdf.PdfFileWriter()
        outfile.addPage(inputfile.getPage(page))
        outfile.write(outbuf)

    r = werkzeug.Response(outbuf.getvalue(), mimetype="application/pdf")
    # splitext instead of [:-3]: the slice left a trailing dot and produced
    # "name..Page-N" without any extension.
    stem = os.path.splitext(os.path.basename(path))[0]
    client_filename = u"{0}.Page-{1}.pdf".format(stem, document['pagenumber'])
    r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
    return r
class MyHtmlFormatter(highlight.HtmlFormatter):
    """HtmlFormatter variant that HTML-escapes the text before formatting it."""

    def _format_fragment(self, text, fragment, seen):
        """Escape *text* and delegate to HtmlFormatter._format_fragment."""
        # NOTE(review): if the fragment's token offsets refer to the
        # unescaped text, escaping first may shift the highlighted spans
        # whenever the text contains characters that get escaped -- confirm.
        escaped = flask.Markup.escape(text)
        return highlight.HtmlFormatter._format_fragment(
            self, unicode(escaped), fragment, seen)
def _group_matches_by_path(matches):
    """Group per-document matches by file path, keeping first-seen order.

    Returns a list with one dict per distinct path holding 'matches',
    'title', 'first_docnum' and 'filename'.
    """
    groups = []
    by_path = {}
    for match in matches:
        path = match['path']
        group = by_path.get(path)
        if group is None:
            group = {'matches': [],
                     'title': match['title'],
                     'first_docnum': match['docnum'],
                     'filename': os.path.basename(path)}
            by_path[path] = group
            groups.append(group)
        group['matches'].append({'excerpt': match['excerpt'],
                                 'docnum': match['docnum'],
                                 'pagenumber': match['pagenumber']})
    return groups


@app.route("/search/skip=<int:skip>/<path:term>", methods=["GET"])
@app.route("/search/<path:term>", methods=["GET"])
@app.route("/search/", methods=["GET"])
def do_search(skip=0, term=None):
    """Search the "content" field and render the hits grouped by file.

    skip -- number of file groups to skip (pagination offset).
    term -- query string; when empty or missing the blank search page
            is rendered.

    Renders search.html with up to five file groups starting at *skip*,
    the term, the offset and the number of matching documents.
    """
    # A missing term can never be parsed, whatever the offset is.
    if not term:
        return flask.render_template('search.html', objects=[], term="", skip=0)

    query = QueryParser("content").parse(term)
    results = searcher.search(query, limit=1001, sortedby="path")
    terms = [text for fieldname, text in query.all_terms()
             if fieldname == "content"]

    # Built once instead of once per hit.
    analyzer = analysis.StandardAnalyzer()
    fragmenter = highlight.SimpleFragmenter()

    matches = []
    for result in results:
        # The formatter is still created per hit, as before, in case it
        # keeps state between highlight() calls.
        excerpt = highlight.highlight(result.get("content"),
                                      terms,
                                      analyzer,
                                      fragmenter,
                                      MyHtmlFormatter())
        matches.append({'path': result.get('path'),
                        'excerpt': excerpt,
                        'docnum': result.docnum,
                        'pagenumber': result.get('pagenumber'),
                        'title': result.get("title")})

    # An ordered list keeps the order in which the search returned the hits;
    # slicing dict.values() paged through the groups in arbitrary order.
    match_groups = _group_matches_by_path(matches)
    objects = match_groups[skip:skip + 5]
    return flask.render_template('search.html',
                                 match_groups=objects, term=term, skip=skip,
                                 resultlen=len(results))
if __name__ == "__main__":
    # Development entry point: debug mode on, listening on all interfaces.
    # NOTE(review): debug mode enables Werkzeug's interactive debugger;
    # combined with host 0.0.0.0 that is reachable from the network --
    # confirm this is never used outside a trusted environment.
    app.run(host="0.0.0.0", debug=True)
|