From 15f17b4fce5a1e4b70709aac569c18d8cf57deba Mon Sep 17 00:00:00 2001 From: yvesf Date: Fri, 26 Nov 2010 23:59:59 +0100 Subject: add pdfminer for text extraction; replaced all ' with " metadata hacking --- web.py | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'web.py') diff --git a/web.py b/web.py index b675c2c..7f8c0e5 100644 --- a/web.py +++ b/web.py @@ -33,9 +33,9 @@ def do_index(): def do_book_file(docnum): with index_metadata.reader() as reader: document = reader.stored_fields(docnum) - r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",) - r.headers.add('Content-Disposition', 'attachment', - filename=os.path.basename(document['path'])) + r = werkzeug.Response(open(document["path"], "r"), mimetype="application/pdf",) + r.headers.add("Content-Disposition", "attachment", + filename=os.path.basename(document["path"])) return r @@ -53,7 +53,7 @@ def pdf_to_image(filepath, page, size): if stdout: yield stdout pdffile = StringIO() - page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page) + page = pyPdf.PdfFileReader(file(filepath, "r")).getPage(page) out = pyPdf.PdfFileWriter() out.addPage(page) out.write(pdffile) @@ -66,24 +66,24 @@ def pdf_to_image(filepath, page, size): def do_page_image(docnum,size=260): with index_book.reader() as reader: document = reader.stored_fields(docnum) - page = document['pagenumber'] - 1 - return pdf_to_image(document['path'], page, size=size) + page = document["pagenumber"] - 1 + return pdf_to_image(document["path"], page, size=size) @app.route("/book/frontpage/", methods=["GET"]) def do_book_frontpage(docnum): with index_metadata.reader() as reader: document = reader.stored_fields(docnum) - return pdf_to_image(document['path'], 0, 260) + return pdf_to_image(document["path"], 0, 260) @app.route("/page/file/", methods=["GET"]) def do_page_file(docnum): with index_book.reader() as reader: document = reader.stored_fields(docnum) - filepath = document['path'] - page = document['pagenumber'] - 1 + filepath = document["path"] + page = document["pagenumber"] - 1 app.logger.debug(str_format("Extract page={page} from filepath={filepath}", page=page, filepath=filepath)) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) + inputfile = pyPdf.PdfFileReader(file(filepath, "r")) pdfpage = inputfile.getPage(page) outbuf = StringIO() outfile = pyPdf.PdfFileWriter() @@ -93,7 +93,7 @@ def do_page_file(docnum): r = werkzeug.Response(outbuf, mimetype="application/pdf") client_filename = os.path.basename(filepath)[:-3] client_filename += str_format(u".Page-{page}.pdf", page=page) - r.headers.add('Content-Disposition', 'attachment', filename=client_filename) + r.headers.add("Content-Disposition", "attachment", filename=client_filename) return r class MyHtmlFormatter(highlight.HtmlFormatter): @@ -110,10 +110,10 @@ def do_excerpt(docnum, term): q = q.simplify(reader) terms = [ text for fieldname, text in q.all_terms() if fieldname == "content" ] - excerpt = highlight.highlight(document['content'], + excerpt = highlight.highlight(document["content"], terms, analysis.StandardAnalyzer(), - highlight.SimpleFragmenter(), + highlight.ContextFragmenter(terms, surround=40), MyHtmlFormatter()) return unicode( excerpt ) @@ -121,7 +121,7 @@ def do_excerpt(docnum, term): @app.route("/search/", methods=["GET"]) def do_search(term=None): if not term: - return flask.render_template('search.html', objects=[], term="") + return flask.render_template("search.html", objects=[], term="") term = term.lower() searcher = index_book.searcher() @@ -138,23 +138,23 @@ def do_search(term=None): with index_metadata.searcher() as searcher: docnum = searcher.document_number(path=filepath) with index_metadata.reader() as reader2: - title = reader2.stored_fields(docnum).get('title') + title = reader2.stored_fields(docnum).get("title") books[docnum] = { - 'matches' : [], - 'title':title, - 'filename' : os.path.basename(filepath), + "matches" : [], + "title":title, + "filename" : os.path.basename(filepath), } for match in book[1]: - pagenumber = reader.stored_fields(match[0])['pagenumber'] + pagenumber = reader.stored_fields(match[0])["pagenumber"] match = (match[0], match[1], pagenumber) - books[docnum]['matches'].append(match) + books[docnum]["matches"].append(match) - return flask.render_template('search.html', + return flask.render_template("search.html", books=books, term=term) def log_response(sender, response): - sender.logger.debug('Request context is about to close down. ' - 'Response: %s', response) + sender.logger.debug("Request context is about to close down. " + "Response: %s", response) if __name__ == "__main__": app.run(host="0.0.0.0", port=8000) -- cgit v1.2.1