summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- static/style.css 5
-rw-r--r-- templates/search.html 13
-rw-r--r-- web.py 131
3 files changed, 69 insertions, 80 deletions
diff --git a/static/style.css b/static/style.css
index 11bcece..15365f6 100644
--- a/static/style.css
+++ b/static/style.css
@@ -1,5 +1,6 @@
div#header {
font-size: 150%;
+b
}
div#search {
@@ -15,10 +16,6 @@ div#content div#navigation {
}
div#footer {
- position: fixed;
- bottom: 0px;
- padding: 4px;
- background-color: white;
left: 0px;
right: 0px;
}
diff --git a/templates/search.html b/templates/search.html
index 4afe479..81a1007 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -13,15 +13,20 @@
Matched {{ matches.__len__() }} Book{% if matches.__len__() > 1 %}s{% endif %}
{% for docnum, matches in matches.items() %}
<div class="book">
- book: {{ docnum }}
- <img src="{{ url_for("do_download_image", docnum=docnum, page=0) }}"/>
+ <a href="{{ url_for("do_book_file", docnum=docnum) }}">book: {{ docnum }}</a>
+ <img src="{{ url_for("do_book_frontpage", docnum=docnum) }}"/>
+ <br />
{% for match in matches %}
<div class="match">
- {{ match }}
+ Match at page {{ match[2] }} (
+ <a href="{{ url_for("do_page_image", docnum=match[0]) }}">image</a>,
+ <a href="{{ url_for("do_page_file", docnum=match[0]) }}">pdf</a>)
+ score={{ match[0] }}
+ <a href="{{ url_for("json_excerpt", docnum=match[0], term=term) }}">excerpt</a>
</div>
{% endfor %}
</div>
- match in {{ docnum }}
+ <hr />
{% endfor %}
{% endif %}
{% endblock %}
diff --git a/web.py b/web.py
index 077aa22..edb4a34 100644
--- a/web.py
+++ b/web.py
@@ -24,29 +24,25 @@ index_metadata = open_dir(u"index", u"metadata")
def do_index():
return flask.redirect(flask.url_for("do_search",term=""))
-@app.route("/download/file/<int:docnum>")
-def do_download_file(docnum):
- document = index.reader().stored_fields(docnum)
- r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",)
- r.headers.add('Content-Disposition', 'attachment',
- filename=os.path.basename(document['path']))
- return r
-
-@app.route("/download/image/<int:docnum>", methods=["GET"])
-@app.route("/download/image/<int:docnum>/<int:page>", methods=["GET"])
-def do_download_image(docnum,page=0):
+@app.route("/book/file/<int:docnum>")
+def do_book_file(docnum):
+ with index_metadata.reader() as reader:
+ document = reader.stored_fields(docnum)
+ r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",)
+ r.headers.add('Content-Disposition', 'attachment',
+ filename=os.path.basename(document['path']))
+ return r
+
+
+def pdf_to_image(filepath, page):
+ print page
def generator(process, input):
input.seek(0)
while not process.stdin.closed:
stdout, stderr = process.communicate(input.read())
if stdout:
yield stdout
-
- with index_metadata.reader() as reader:
- document = reader.stored_fields(docnum)
- filepath = document['path']
pdffile = StringIO()
-
page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page)
out = pyPdf.PdfFileWriter()
out.addPage(page)
@@ -55,19 +51,35 @@ def do_download_image(docnum,page=0):
stdin=subprocess.PIPE, stdout=subprocess.PIPE)
return werkzeug.Response(generator(process,pdffile), mimetype="image/jpeg")
-@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"])
-def do_download_page(docnum,page):
- document = index.reader().stored_fields(docnum)
- inputfile = pyPdf.PdfFileReader(file(document['path'], 'r'))
- page = inputfile.getPage(page)
+@app.route("/page/image/<int:docnum>", methods=["GET"])
+def do_page_image(docnum):
+ with index_book.reader() as reader:
+ document = reader.stored_fields(docnum)
+ return pdf_to_image(document['path'], document['pagenumber'])
+
+
+@app.route("/book/frontpage/<int:docnum>", methods=["GET"])
+def do_book_frontpage(docnum):
+ with index_metadata.reader() as reader:
+ document = reader.stored_fields(docnum)
+ return pdf_to_image(document['path'], 0)
+
+@app.route("/page/file/<int:docnum>", methods=["GET"])
+def do_page_file(docnum):
+ with index_book.reader() as reader:
+ document = reader.stored_fields(docnum)
+ filepath = document['path']
+ page = document['pagenumber']
+ inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+ pdfpage = inputfile.getPage(page)
outbuf = StringIO()
outfile = pyPdf.PdfFileWriter()
- outfile.addPage(page)
+ outfile.addPage(pdfpage)
outfile.write(outbuf)
outbuf.seek(0)
r = werkzeug.Response(outbuf, mimetype="application/pdf")
- client_filename = os.path.basename(document['path'])[:-3]
- client_filename += u".Page-{0}".format(document['pagenumber'])
+ client_filename = os.path.basename(filepath)[:-3]
+ client_filename += u".Page-{0}".format(page)
r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
return r
@@ -78,8 +90,6 @@ class MyHtmlFormatter(highlight.HtmlFormatter):
@app.route("/json/excerpt/<int:docnum>/<path:term>", methods=["GET"])
def json_excerpt(docnum, term):
- searcher = index.searcher()
-
def generator(q):
for result in searcher.search(q, limit=1, sortedby="pagenumber"):
terms = [ text for fieldname, text in q.all_terms()
@@ -94,10 +104,17 @@ def json_excerpt(docnum, term):
'excerpt':excerpt,
'docnum':result.docnum} )
- document = searcher.reader().stored_fields(docnum)
+ with index_book.reader() as reader:
+ document = reader.stored_fields(docnum)
q = QueryParser("content").parse(term)
- q = query.And([ q, query.Term("path", document['path']) ] )
- return werkzeug.Response( generator(q) )
+ terms = [ text for fieldname, text in q.all_terms()
+ if fieldname == "content" ]
+ excerpt = highlight.highlight(document['content'],
+ terms,
+ analysis.FancyAnalyzer(),
+ highlight.SimpleFragmenter(),
+ MyHtmlFormatter())
+ return unicode( excerpt )
@app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
@app.route("/search/<path:term>",methods=["GET"])
@@ -110,54 +127,24 @@ def do_search(skip=0,term=None):
query = QueryParser("content").parse(term)
facets = searching.Facets.from_field(searcher, "path")
results = searcher.search(query, limit=None)
- categories = facets.categorize(results).items()
+ for result in results:
+ print result.get("pagenumber")
+ categories = facets.categorize(results)
searcher.close()
matches = {}
- for book in categories:
- filepath = book[0]
- with index_metadata.searcher() as searcher:
- docnum = searcher.document_number(path=filepath)
- matches[docnum] = []
- for match in book[1]:
- matches[docnum].append(match)
+ with index_book.reader() as reader:
+ for book in categories.items():
+ filepath = book[0]
+ with index_metadata.searcher() as searcher:
+ docnum = searcher.document_number(path=filepath)
+ matches[docnum] = []
+ for match in book[1]:
+ pagenumber = reader.stored_fields(match[0])['pagenumber']
+ match = (match[0], match[1], pagenumber)
+ matches[docnum].append(match)
return flask.render_template('search.html', matches=matches, term=term)
- """
- terms = [text for fieldname, text in query.all_terms()
- if fieldname == "content"]
-
- matches = []
- for result in results:
- title = result.get("title")
- path = result.get("path")
- excerpt = highlight.highlight(result.get("content"),
- terms,
- analysis.StandardAnalyzer(),
- highlight.SimpleFragmenter(),
- MyHtmlFormatter())
- matches.append( {'path':result.get('path'),
- 'excerpt':excerpt,
- 'docnum':result.docnum,
- 'pagenumber':result.get('pagenumber'),
- 'title':title })
- match_groups = {}
- for match in matches:
- print match
- if not match_groups.has_key(match['path']):
- match_groups[match['path']] = { 'matches':[],
- 'title' : match['title'],
- 'first_docnum' : match['docnum'],
- 'filename':os.path.basename(match['path']) }
- match_groups[ match['path'] ]['matches'].append(
- { 'excerpt':match['excerpt'],
- 'docnum':match['docnum'],
- 'pagenumber':match['pagenumber'] })
-
- objects = match_groups.values()[skip:skip+5]
- return flask.render_template('search.html',
- match_groups=objects, term=term, skip=skip, resultlen=len(results))
- """
if __name__ == "__main__":
app.debug = True