#!/usr/bin/python2.6
# coding: utf-8
"""Small Flask front-end for searching a Whoosh index of PDF books.

Two Whoosh indexes are opened from the ``index`` directory at import time:

* ``book``     -- documents read here for ``path``, ``pagenumber`` and
                  ``content`` stored fields (one document per page).
* ``metadata`` -- documents looked up here by ``path`` (one per book).

The views serve whole books, single pages (as PDF or as a JPEG thumbnail
rendered by ImageMagick's ``convert``), highlighted excerpts and the search
result page.
"""

import os
import json
from StringIO import StringIO
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import whoosh.searching as searching
import whoosh.analysis as analysis
import whoosh.highlight as highlight
import whoosh.query as query
import flask
import pyPdf
import werkzeug
import subprocess

app = flask.Flask("booksearch")

index_book = open_dir(u"index", u"book")
index_metadata = open_dir(u"index", u"metadata")


@app.route("/")
def do_index():
    """Redirect the site root to the (empty) search page."""
    return flask.redirect(flask.url_for("do_search", term=""))


# NOTE(review): the URL variables below (``<int:docnum>``, ``<term>``) were
# missing from every route string although the view functions require them;
# they have been restored here.  Confirm the converters against the templates
# and any client code that builds these URLs.
@app.route("/book/file/<int:docnum>")
def do_book_file(docnum):
    """Send the complete PDF of a book as a download.

    :param docnum: document number in the ``metadata`` index.
    :returns: a response with the PDF as body and a ``Content-Disposition``
        header carrying the file's base name.
    """
    with index_metadata.reader() as reader:
        document = reader.stored_fields(docnum)
    # PDFs are binary data, so the file is opened in binary mode.
    r = werkzeug.Response(open(document['path'], "rb"),
                          mimetype="application/pdf")
    r.headers.add('Content-Disposition', 'attachment',
                  filename=os.path.basename(document['path']))
    return r


def _extract_page(filepath, page):
    """Return a one-page PDF (as a byte string) cut out of ``filepath``.

    :param filepath: path of the source PDF.
    :param page: 1-based page number to extract.
    """
    buf = StringIO()
    # The source file has to stay open until the writer has finished,
    # because the page object is still backed by the reader's stream.
    with open(filepath, 'rb') as source:
        pdfpage = pyPdf.PdfFileReader(source).getPage(page - 1)
        writer = pyPdf.PdfFileWriter()
        writer.addPage(pdfpage)
        writer.write(buf)
    return buf.getvalue()


def pdf_to_image(filepath, page):
    """Render one page of a PDF as a 260 pixel wide JPEG.

    The page is extracted into a single-page PDF and piped through
    ``/usr/bin/convert``.

    :param filepath: path of the source PDF.
    :param page: 1-based page number to render.
    :returns: a response with mimetype ``image/jpeg``; the request is aborted
        with HTTP 500 when ``convert`` fails or produces no output.
    """
    pdfdata = _extract_page(filepath, page)
    process = subprocess.Popen(["/usr/bin/convert", "-resize", "260x",
                                "pdf:-", "jpeg:-"],
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    # communicate() feeds the whole input, collects the whole output and
    # waits for the child, so a single call is enough and the child is
    # always reaped.
    image, _ = process.communicate(pdfdata)
    if process.returncode != 0 or not image:
        flask.abort(500)
    return werkzeug.Response(image, mimetype="image/jpeg")


@app.route("/page/image/<int:docnum>", methods=["GET"])
def do_page_image(docnum):
    """Return a JPEG thumbnail of the page stored as ``docnum`` in ``book``."""
    with index_book.reader() as reader:
        document = reader.stored_fields(docnum)
    return pdf_to_image(document['path'], document['pagenumber'])


@app.route("/book/frontpage/<int:docnum>", methods=["GET"])
def do_book_frontpage(docnum):
    """Return a JPEG thumbnail of the first page of a book.

    :param docnum: document number in the ``metadata`` index.
    """
    with index_metadata.reader() as reader:
        document = reader.stored_fields(docnum)
    # pdf_to_image() takes 1-based page numbers; passing 0 made it call
    # getPage(-1), which is the *last* page of the document.
    return pdf_to_image(document['path'], 1)


@app.route("/page/file/<int:docnum>", methods=["GET"])
def do_page_file(docnum):
    """Send a single page of a book as a one-page PDF download.

    :param docnum: document number in the ``book`` index.
    :returns: a response whose suggested file name is
        ``<book name>.Page-<n>.pdf``.
    """
    with index_book.reader() as reader:
        document = reader.stored_fields(docnum)
    filepath = document['path']
    page = document['pagenumber']

    r = werkzeug.Response(_extract_page(filepath, page),
                          mimetype="application/pdf")
    # splitext() drops the extension together with its dot; slicing off the
    # last three characters left a stray "." and produced "name..Page-N.pdf".
    client_filename = os.path.splitext(os.path.basename(filepath))[0]
    client_filename += u".Page-{0}.pdf".format(page)
    r.headers.add('Content-Disposition', 'attachment',
                  filename=client_filename)
    return r


class MyHtmlFormatter(highlight.HtmlFormatter):
    """HtmlFormatter that HTML-escapes the text before formatting it."""

    def _format_fragment(self, text, fragment, seen):
        # NOTE(review): the fragment was computed on the unescaped text;
        # if it carries character offsets, escaping may shift them when the
        # text contains markup characters -- confirm against whoosh.
        text = unicode(flask.Markup.escape(text))
        return highlight.HtmlFormatter._format_fragment(self, text,
                                                        fragment, seen)


@app.route("/excerpt/<int:docnum>/<term>", methods=["GET"])
def excerpt(docnum, term):
    """Return an HTML excerpt of one page with the search terms highlighted.

    :param docnum: document number in the ``book`` index.
    :param term: query string, parsed against the ``content`` field.
    :returns: the highlighted excerpt as a unicode string.
    """
    with index_book.reader() as reader:
        document = reader.stored_fields(docnum)
    q = QueryParser("content").parse(term)
    # Only terms that apply to the page text are worth highlighting.
    terms = [text for fieldname, text in q.all_terms()
             if fieldname == "content"]
    highlighted = highlight.highlight(document['content'],
                                      terms,
                                      analysis.StandardAnalyzer(),
                                      highlight.SimpleFragmenter(),
                                      MyHtmlFormatter())
    return unicode(highlighted)


@app.route("/search/", methods=["GET"])
@app.route("/search/<term>", methods=["GET"])
def do_search(term=None):
    """Search all pages for ``term`` and render the hits grouped by book.

    :param term: query string; when empty or missing the bare search form
        is rendered.
    :returns: the rendered ``search.html`` template.  ``matches`` maps the
        book's document number in the ``metadata`` index to a list of
        ``(page docnum, second element of the hit, pagenumber)`` tuples.
    """
    if not term:
        return flask.render_template('search.html', objects=[], term="")

    # Parse first so a malformed query cannot leak an open searcher.
    parsed_query = QueryParser("content").parse(term)

    searcher = index_book.searcher()
    try:
        facets = searching.Facets.from_field(searcher, "path")
        results = searcher.search(parsed_query, limit=None,
                                  sortedby="pagenumber")
        categories = facets.categorize(results)
    finally:
        searcher.close()

    matches = {}
    with index_book.reader() as reader:
        # One metadata searcher serves every book instead of reopening it
        # on each loop iteration.
        with index_metadata.searcher() as meta_searcher:
            for filepath, hits in categories.items():
                docnum = meta_searcher.document_number(path=filepath)
                matches[docnum] = []
                for hit in hits:
                    pagenumber = reader.stored_fields(hit[0])['pagenumber']
                    matches[docnum].append((hit[0], hit[1], pagenumber))

    return flask.render_template('search.html', matches=matches, term=term)


def log_response(sender, response):
    """Log ``response`` at debug level through ``sender.logger``.

    Nothing in this file connects this function to a signal.
    """
    sender.logger.debug('Request context is about to close down.  '
                        'Response: %s', response)


if __name__ == "__main__":
    # WARNING: debug mode enables the interactive Werkzeug debugger, and
    # binding to 0.0.0.0 makes it reachable from other hosts.  Use this only
    # on a trusted network.
    app.debug = True
    app.run(host="0.0.0.0")