#!/usr/bin/python2.6
# coding: utf-8
"""Small Flask front end for full-text search over indexed PDF books.

Two Whoosh indexes are opened from the ``index`` directory at import time:

* ``book``     -- read here for the stored fields ``path``, ``pagenumber``
                  and ``content`` and searched on ``content``.
* ``metadata`` -- read here for the stored fields ``path`` and ``title``.

The views serve whole PDFs, single pages (as PDF or as a JPEG rendered by
ImageMagick's ``convert``), highlighted excerpts and the search page.
"""

import json
import os
import subprocess
from StringIO import StringIO

import flask
import pyPdf
import werkzeug
import whoosh.analysis as analysis
import whoosh.highlight as highlight
import whoosh.query as query
import whoosh.searching as searching
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

app = flask.Flask("booksearch")
index_book = open_dir(u"index", u"book")
index_metadata = open_dir(u"index", u"metadata")


@app.route("/")
def do_index():
    """Redirect the site root to the search view with an empty term."""
    return flask.redirect(flask.url_for("do_search", term=""))


@app.route("/book/file/<int:docnum>")
def do_book_file(docnum):
    """Send the complete PDF of a book as an attachment.

    :param docnum: document number in the ``metadata`` index.
    :returns: a ``werkzeug.Response`` streaming the file at the stored
        ``path`` with a ``Content-Disposition: attachment`` header.
    """
    with index_metadata.reader() as reader:
        document = reader.stored_fields(docnum)
    filepath = document['path']
    # PDFs are binary data, so open in binary mode.
    response = werkzeug.Response(open(filepath, "rb"),
                                 mimetype="application/pdf")
    response.headers.add('Content-Disposition', 'attachment',
                         filename=os.path.basename(filepath))
    return response


def _extract_pdf_page(filepath, page):
    """Return a rewound ``StringIO`` holding a one-page PDF.

    :param filepath: path of the source PDF.
    :param page: zero-based page index passed to ``getPage``.
    """
    buf = StringIO()
    # The source file stays open until the writer has finished, because the
    # page object may still read from it while being written out.
    with open(filepath, 'rb') as source:
        writer = pyPdf.PdfFileWriter()
        writer.addPage(pyPdf.PdfFileReader(source).getPage(page))
        writer.write(buf)
    buf.seek(0)
    return buf


def pdf_to_image(filepath, page, size):
    """Render one PDF page as a JPEG by piping it through ``convert``.

    :param filepath: path of the source PDF.
    :param page: zero-based page index.
    :param size: target width in pixels; anything that is not an int in
        the range 1..2000 falls back to 260.
    :returns: a ``werkzeug.Response`` with mimetype ``image/jpeg`` whose
        body is produced when the response is iterated.
    """
    if not isinstance(size, int) or size < 1 or size > 2000:
        size = 260
    # Rasterisation density grows with the requested width.
    density = 60 + 0.15 * size
    app.logger.debug(
        "Convert PDF to image page={0} size={1} density={2} filepath={3}"
        .format(page, size, density, filepath))

    pdf_data = _extract_pdf_page(filepath, page).getvalue()

    process = subprocess.Popen(
        ["/usr/bin/convert",
         "-density", "{0}".format(density),
         "-resize", "{0}x".format(size),
         "pdf:-", "jpeg:-"],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def generate():
        # communicate() feeds the whole PDF, closes stdin and collects all
        # output, so a single call is sufficient.
        stdout, _ = process.communicate(pdf_data)
        if stdout:
            yield stdout

    return werkzeug.Response(generate(), mimetype="image/jpeg")


@app.route("/page/image/<int:docnum>", methods=["GET"])
@app.route("/page/image/<int:docnum>/<int:size>", methods=["GET"])
def do_page_image(docnum, size=260):
    """Serve a JPEG rendering of one indexed page.

    :param docnum: document number in the ``book`` index.
    :param size: requested image width, forwarded to ``pdf_to_image``.
    """
    with index_book.reader() as reader:
        document = reader.stored_fields(docnum)
    # The stored page number minus one is used as the pyPdf page index.
    page = document['pagenumber'] - 1
    return pdf_to_image(document['path'], page, size=size)


@app.route("/book/frontpage/<int:docnum>", methods=["GET"])
def do_book_frontpage(docnum):
    """Serve a 260px wide JPEG of the first page of a book.

    :param docnum: document number in the ``metadata`` index.
    """
    with index_metadata.reader() as reader:
        document = reader.stored_fields(docnum)
    return pdf_to_image(document['path'], 0, 260)


@app.route("/page/file/<int:docnum>", methods=["GET"])
def do_page_file(docnum):
    """Send a single indexed page as a one-page PDF attachment.

    :param docnum: document number in the ``book`` index.
    :returns: a ``werkzeug.Response`` with mimetype ``application/pdf``.
    """
    with index_book.reader() as reader:
        document = reader.stored_fields(docnum)
    filepath = document['path']
    page = document['pagenumber'] - 1
    app.logger.debug(
        "Extract page={0} from filepath={1}".format(page, filepath))

    outbuf = _extract_pdf_page(filepath, page)
    response = werkzeug.Response(outbuf, mimetype="application/pdf")

    # Strip the extension with splitext: slicing off the last three
    # characters left the dot behind and produced "name..Page-N.pdf".
    stem = os.path.splitext(os.path.basename(filepath))[0]
    client_filename = stem + u".Page-{0}.pdf".format(page)
    response.headers.add('Content-Disposition', 'attachment',
                         filename=client_filename)
    return response


class MyHtmlFormatter(highlight.HtmlFormatter):
    """HtmlFormatter that HTML-escapes the text before formatting it."""

    def _format_fragment(self, text, fragment, seen):
        # NOTE(review): escaping can change the length of the text; if the
        # base class locates matches by character offsets into ``text``
        # they may no longer line up -- confirm against the Whoosh version
        # in use.
        text = unicode(flask.Markup.escape(text))
        return highlight.HtmlFormatter._format_fragment(
            self, text, fragment, seen)


@app.route("/excerpt/<int:docnum>/<term>", methods=["GET"])
def excerpt(docnum, term):
    """Return the highlighted excerpt of one page for a search term.

    :param docnum: document number in the ``book`` index.
    :param term: query string, parsed against the ``content`` field.
    :returns: the highlighted fragment(s) as a unicode string.
    """
    with index_book.reader() as reader:
        document = reader.stored_fields(docnum)
    q = QueryParser("content").parse(term)
    terms = [text for fieldname, text in q.all_terms()
             if fieldname == "content"]
    highlighted = highlight.highlight(document['content'],
                                      terms,
                                      analysis.StandardAnalyzer(),
                                      highlight.SimpleFragmenter(),
                                      MyHtmlFormatter())
    return unicode(highlighted)


@app.route("/search/", methods=["GET"])
@app.route("/search/<term>", methods=["GET"])
def do_search(term=None):
    """Search page contents and render the hits grouped by book.

    :param term: query string; when empty or missing the bare search page
        is rendered.
    :returns: the rendered ``search.html`` template. ``books`` maps the
        metadata docnum of each book to a dict with ``title``,
        ``filename`` and ``matches``; every match is the categorised hit's
        first two items followed by the page number.
    """
    if not term:
        # ``books`` is supplied as well so the template receives the same
        # variable as in the result case below.
        return flask.render_template('search.html',
                                     objects=[], books={}, term="")

    # Parse before opening the searcher so a parse error leaks nothing.
    parsed_query = QueryParser("content").parse(term)
    searcher = index_book.searcher()
    try:
        facets = searching.Facets.from_field(searcher, "path")
        results = searcher.search(parsed_query, limit=None,
                                  sortedby="pagenumber")
        categories = facets.categorize(results)
    finally:
        searcher.close()

    books = {}
    # Nested ``with`` blocks keep this valid on Python 2.6.
    with index_book.reader() as book_reader:
        with index_metadata.searcher() as meta_searcher:
            with index_metadata.reader() as meta_reader:
                for filepath, hits in categories.items():
                    docnum = meta_searcher.document_number(path=filepath)
                    title = meta_reader.stored_fields(docnum).get('title')
                    matches = [
                        (hit[0], hit[1],
                         book_reader.stored_fields(hit[0])['pagenumber'])
                        for hit in hits
                    ]
                    books[docnum] = {
                        'matches': matches,
                        'title': title,
                        'filename': os.path.basename(filepath),
                    }
    return flask.render_template('search.html', books=books, term=term)


def log_response(sender, response):
    """Log a response at debug level through ``sender.logger``.

    Nothing in this file registers the function as a handler.
    """
    sender.logger.debug('Request context is about to close down.  '
                        'Response: %s', response)


if __name__ == "__main__":
    # WARNING: debug mode enables the interactive Werkzeug debugger, and
    # host 0.0.0.0 exposes it on every interface. Use on trusted networks
    # only.
    app.debug = True
    app.run(host="0.0.0.0")