From 1f408d58a0853653d9297bd048512c3e4e279512 Mon Sep 17 00:00:00 2001 From: yvesf Date: Mon, 22 Nov 2010 00:52:52 +0100 Subject: index/search/convert works; ugly template --- static/style.css | 5 +- templates/search.html | 13 +++-- web.py | 131 +++++++++++++++++++++++--------------------------- 3 files changed, 69 insertions(+), 80 deletions(-) diff --git a/static/style.css b/static/style.css index 11bcece..15365f6 100644 --- a/static/style.css +++ b/static/style.css @@ -1,5 +1,6 @@ div#header { font-size: 150%; +b } div#search { @@ -15,10 +16,6 @@ div#content div#navigation { } div#footer { - position: fixed; - bottom: 0px; - padding: 4px; - background-color: white; left: 0px; right: 0px; } diff --git a/templates/search.html b/templates/search.html index 4afe479..81a1007 100644 --- a/templates/search.html +++ b/templates/search.html @@ -13,15 +13,20 @@ Matched {{ matches.__len__() }} Book{% if matches.__len__() > 1 %}s{% endif %} {% for docnum, matches in matches.items() %}
- book: {{ docnum }} - + book: {{ docnum }} + +
{% for match in matches %}
- {{ match }} + Match at page {{ match[2] }} ( + image, + pdf) + score={{ match[0] }} + excerpt
{% endfor %}
- match in {{ docnum }} +
{% endfor %} {% endif %} {% endblock %} diff --git a/web.py b/web.py index 077aa22..edb4a34 100644 --- a/web.py +++ b/web.py @@ -24,29 +24,25 @@ index_metadata = open_dir(u"index", u"metadata") def do_index(): return flask.redirect(flask.url_for("do_search",term="")) -@app.route("/download/file/") -def do_download_file(docnum): - document = index.reader().stored_fields(docnum) - r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",) - r.headers.add('Content-Disposition', 'attachment', - filename=os.path.basename(document['path'])) - return r - -@app.route("/download/image/", methods=["GET"]) -@app.route("/download/image//", methods=["GET"]) -def do_download_image(docnum,page=0): +@app.route("/book/file/") +def do_book_file(docnum): + with index_metadata.reader() as reader: + document = reader.stored_fields(docnum) + r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",) + r.headers.add('Content-Disposition', 'attachment', + filename=os.path.basename(document['path'])) + return r + + +def pdf_to_image(filepath, page): + print page def generator(process, input): input.seek(0) while not process.stdin.closed: stdout, stderr = process.communicate(input.read()) if stdout: yield stdout - - with index_metadata.reader() as reader: - document = reader.stored_fields(docnum) - filepath = document['path'] pdffile = StringIO() - page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page) out = pyPdf.PdfFileWriter() out.addPage(page) @@ -55,19 +51,35 @@ def do_download_image(docnum,page=0): stdin=subprocess.PIPE, stdout=subprocess.PIPE) return werkzeug.Response(generator(process,pdffile), mimetype="image/jpeg") -@app.route("/download/page//", methods=["GET"]) -def do_download_page(docnum,page): - document = index.reader().stored_fields(docnum) - inputfile = pyPdf.PdfFileReader(file(document['path'], 'r')) - page = inputfile.getPage(page) +@app.route("/page/image/", methods=["GET"]) +def do_page_image(docnum): + with 
index_book.reader() as reader: + document = reader.stored_fields(docnum) + return pdf_to_image(document['path'], document['pagenumber']) + + +@app.route("/book/frontpage/", methods=["GET"]) +def do_book_frontpage(docnum): + with index_metadata.reader() as reader: + document = reader.stored_fields(docnum) + return pdf_to_image(document['path'], 0) + +@app.route("/page/file/", methods=["GET"]) +def do_page_file(docnum): + with index_book.reader() as reader: + document = reader.stored_fields(docnum) + filepath = document['path'] + page = document['pagenumber'] + inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) + pdfpage = inputfile.getPage(page) outbuf = StringIO() outfile = pyPdf.PdfFileWriter() - outfile.addPage(page) + outfile.addPage(pdfpage) outfile.write(outbuf) outbuf.seek(0) r = werkzeug.Response(outbuf, mimetype="application/pdf") - client_filename = os.path.basename(document['path'])[:-3] - client_filename += u".Page-{0}".format(document['pagenumber']) + client_filename = os.path.basename(filepath)[:-3] + client_filename += u".Page-{0}".format(page) r.headers.add('Content-Disposition', 'attachment', filename=client_filename) return r @@ -78,8 +90,6 @@ class MyHtmlFormatter(highlight.HtmlFormatter): @app.route("/json/excerpt//", methods=["GET"]) def json_excerpt(docnum, term): - searcher = index.searcher() - def generator(q): for result in searcher.search(q, limit=1, sortedby="pagenumber"): terms = [ text for fieldname, text in q.all_terms() @@ -94,10 +104,17 @@ def json_excerpt(docnum, term): 'excerpt':excerpt, 'docnum':result.docnum} ) - document = searcher.reader().stored_fields(docnum) + with index_book.reader() as reader: + document = reader.stored_fields(docnum) q = QueryParser("content").parse(term) - q = query.And([ q, query.Term("path", document['path']) ] ) - return werkzeug.Response( generator(q) ) + terms = [ text for fieldname, text in q.all_terms() + if fieldname == "content" ] + excerpt = highlight.highlight(document['content'], + terms, + 
analysis.FancyAnalyzer(), + highlight.SimpleFragmenter(), + MyHtmlFormatter()) + return unicode( excerpt ) @app.route("/search/skip=/",methods=["GET"]) @app.route("/search/",methods=["GET"]) @@ -110,54 +127,24 @@ def do_search(skip=0,term=None): query = QueryParser("content").parse(term) facets = searching.Facets.from_field(searcher, "path") results = searcher.search(query, limit=None) - categories = facets.categorize(results).items() + for result in results: + print result.get("pagenumber") + categories = facets.categorize(results) searcher.close() matches = {} - for book in categories: - filepath = book[0] - with index_metadata.searcher() as searcher: - docnum = searcher.document_number(path=filepath) - matches[docnum] = [] - for match in book[1]: - matches[docnum].append(match) + with index_book.reader() as reader: + for book in categories.items(): + filepath = book[0] + with index_metadata.searcher() as searcher: + docnum = searcher.document_number(path=filepath) + matches[docnum] = [] + for match in book[1]: + pagenumber = reader.stored_fields(match[0])['pagenumber'] + match = (match[0], match[1], pagenumber) + matches[docnum].append(match) return flask.render_template('search.html', matches=matches, term=term) - """ - terms = [text for fieldname, text in query.all_terms() - if fieldname == "content"] - - matches = [] - for result in results: - title = result.get("title") - path = result.get("path") - excerpt = highlight.highlight(result.get("content"), - terms, - analysis.StandardAnalyzer(), - highlight.SimpleFragmenter(), - MyHtmlFormatter()) - matches.append( {'path':result.get('path'), - 'excerpt':excerpt, - 'docnum':result.docnum, - 'pagenumber':result.get('pagenumber'), - 'title':title }) - match_groups = {} - for match in matches: - print match - if not match_groups.has_key(match['path']): - match_groups[match['path']] = { 'matches':[], - 'title' : match['title'], - 'first_docnum' : match['docnum'], - 'filename':os.path.basename(match['path']) } - 
match_groups[ match['path'] ]['matches'].append( - { 'excerpt':match['excerpt'], - 'docnum':match['docnum'], - 'pagenumber':match['pagenumber'] }) - - objects = match_groups.values()[skip:skip+5] - return flask.render_template('search.html', - match_groups=objects, term=term, skip=skip, resultlen=len(results)) - """ if __name__ == "__main__": app.debug = True -- cgit v1.2.1