#!/usr/bin/python2.6
# coding: utf-8
"""Flask front end for searching a Whoosh index and downloading PDF hits.

The index is opened from the ``index`` directory at import time.  The views
read the stored fields ``path``, ``title``, ``content`` and ``pagenumber``
from indexed documents; ``path`` is served as a PDF file.
"""

import os
from StringIO import StringIO

from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import whoosh.fields as fields
import whoosh.analysis as analysis
import whoosh.highlight as highlight
import flask
import pyPdf
import werkzeug

app = flask.Flask("booksearch")
index = open_dir(u"index", mapped=False)
searcher = index.searcher()

# Number of per-file result groups shown on one results page.
GROUPS_PER_PAGE = 5


@app.route("/")
def do_index():
    """Redirect the site root to the (empty) search page."""
    return flask.redirect(flask.url_for("do_search", term=""))


@app.route("/download/file/<int:docnum>")
def do_download_file(docnum):
    """Send the whole PDF that the indexed document ``docnum`` points to.

    ``docnum`` is the Whoosh document number; the file location is taken
    from the document's stored ``path`` field.
    """
    document = index.reader().stored_fields(docnum)
    # Binary mode: a PDF must not go through newline translation.
    # The open file object is handed to the response as its body iterable.
    r = werkzeug.Response(open(document['path'], "rb"),
                          mimetype="application/pdf")
    r.headers.add('Content-Disposition', 'attachment',
                  filename=os.path.basename(document['path']))
    return r


@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"])
def do_download_page(docnum, page):
    """Send a single page of the PDF behind document ``docnum``.

    ``page`` is passed straight to ``PdfFileReader.getPage``.  The page is
    copied into a new in-memory PDF, which is returned as an attachment
    named after the source file and the document's stored ``pagenumber``.
    """
    document = index.reader().stored_fields(docnum)
    outbuf = StringIO()
    source = open(document['path'], 'rb')
    try:
        inputfile = pyPdf.PdfFileReader(source)
        outfile = pyPdf.PdfFileWriter()
        outfile.addPage(inputfile.getPage(page))
        # Keep the source open until the write is done, then close it in
        # the finally clause so the handle is not leaked.
        outfile.write(outbuf)
    finally:
        source.close()
    outbuf.seek(0)

    r = werkzeug.Response(outbuf, mimetype="application/pdf")
    # Drop the original extension properly (the old "[:-3]" slice left a
    # trailing dot) and keep a ".pdf" suffix on the downloaded file.
    base_name = os.path.splitext(os.path.basename(document['path']))[0]
    client_filename = u"{0}.Page-{1}.pdf".format(base_name,
                                                 document['pagenumber'])
    r.headers.add('Content-Disposition', 'attachment',
                  filename=client_filename)
    return r


class MyHtmlFormatter(highlight.HtmlFormatter):
    """HtmlFormatter that HTML-escapes the text before formatting it."""

    def _format_fragment(self, text, fragment, seen):
        # NOTE(review): escaping can change the length of ``text``; if the
        # fragment carries character offsets into the unescaped text they
        # may no longer line up -- confirm against the Whoosh version used.
        text = unicode(flask.Markup.escape(text))
        return highlight.HtmlFormatter._format_fragment(self, text, fragment,
                                                        seen)


def _collect_matches(results, terms):
    """Build one dict (with a highlighted excerpt) per search hit."""
    matches = []
    for result in results:
        excerpt = highlight.highlight(result.get("content"),
                                      terms,
                                      analysis.StandardAnalyzer(),
                                      highlight.SimpleFragmenter(),
                                      MyHtmlFormatter())
        matches.append({
            'path': result.get("path"),
            'excerpt': excerpt,
            'docnum': result.docnum,
            'pagenumber': result.get('pagenumber'),
            'title': result.get("title"),
        })
    return matches


def _group_matches_by_path(matches):
    """Group hits by file path, returning groups in first-seen order."""
    groups_by_path = {}
    ordered_groups = []
    for match in matches:
        group = groups_by_path.get(match['path'])
        if group is None:
            group = {
                'matches': [],
                'title': match['title'],
                'first_docnum': match['docnum'],
                'filename': os.path.basename(match['path']),
            }
            groups_by_path[match['path']] = group
            # A plain dict would lose the order the hits arrived in, so the
            # groups are also tracked in a list (no OrderedDict on 2.6).
            ordered_groups.append(group)
        group['matches'].append({
            'excerpt': match['excerpt'],
            'docnum': match['docnum'],
            'pagenumber': match['pagenumber'],
        })
    return ordered_groups


@app.route("/search/skip=<int:skip>/<term>", methods=["GET"])
@app.route("/search/<term>", methods=["GET"])
@app.route("/search/", methods=["GET"])
def do_search(skip=0, term=None):
    """Run a query against the ``content`` field and render the results.

    ``term`` is the query string and ``skip`` the number of per-file groups
    to skip (pagination offset).  Hits are grouped by file path, and
    ``GROUPS_PER_PAGE`` groups starting at ``skip`` are passed to the
    ``search.html`` template.  With no term and no offset an empty search
    page is rendered.
    """
    if skip == 0 and not term:
        # NOTE(review): this branch passes ``objects`` while the results
        # branch passes ``match_groups`` -- confirm what the template reads.
        return flask.render_template('search.html', objects=[], term="",
                                     skip=0)

    query = QueryParser("content").parse(term)
    results = searcher.search(query, limit=1001, sortedby="path")
    terms = [text for fieldname, text in query.all_terms()
             if fieldname == "content"]

    groups = _group_matches_by_path(_collect_matches(results, terms))
    objects = groups[skip:skip + GROUPS_PER_PAGE]

    return flask.render_template('search.html',
                                 match_groups=objects,
                                 term=term,
                                 skip=skip,
                                 resultlen=len(results))


if __name__ == "__main__":
    # NOTE(review): debug mode enables the interactive Werkzeug debugger,
    # and host 0.0.0.0 listens on all interfaces -- only run this way on a
    # trusted network.
    app.debug = True
    app.run(host="0.0.0.0")