summary | refs | log | tree | commit | diff
path: root/web.py
diff options
context:
space:
mode:
author yvesf <yvesf-git@xapek.org> 2010-11-26 23:59:59 +0100
committer yvesf <yvesf-git@xapek.org> 2010-11-26 23:59:59 +0100
commit 15f17b4fce5a1e4b70709aac569c18d8cf57deba (patch)
tree 89eebcc47aebed71cc86eedfaee8cf894e47c07e /web.py
parent 9122cfed447a643a8c383be5558fd82fc3be7cc7 (diff)
download booksearch-15f17b4fce5a1e4b70709aac569c18d8cf57deba.tar.gz
download booksearch-15f17b4fce5a1e4b70709aac569c18d8cf57deba.zip
add pdfminer for text extraction;
replaced all ' with " metadata hacking
Diffstat (limited to 'web.py')
-rw-r--r-- web.py 46
1 files changed, 23 insertions, 23 deletions
diff --git a/web.py b/web.py
index b675c2c..7f8c0e5 100644
--- a/web.py
+++ b/web.py
@@ -33,9 +33,9 @@ def do_index():
def do_book_file(docnum):
with index_metadata.reader() as reader:
document = reader.stored_fields(docnum)
- r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",)
- r.headers.add('Content-Disposition', 'attachment',
- filename=os.path.basename(document['path']))
+ r = werkzeug.Response(open(document["path"], "r"), mimetype="application/pdf",)
+ r.headers.add("Content-Disposition", "attachment",
+ filename=os.path.basename(document["path"]))
return r
@@ -53,7 +53,7 @@ def pdf_to_image(filepath, page, size):
if stdout:
yield stdout
pdffile = StringIO()
- page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page)
+ page = pyPdf.PdfFileReader(file(filepath, "r")).getPage(page)
out = pyPdf.PdfFileWriter()
out.addPage(page)
out.write(pdffile)
@@ -66,24 +66,24 @@ def pdf_to_image(filepath, page, size):
def do_page_image(docnum,size=260):
with index_book.reader() as reader:
document = reader.stored_fields(docnum)
- page = document['pagenumber'] - 1
- return pdf_to_image(document['path'], page, size=size)
+ page = document["pagenumber"] - 1
+ return pdf_to_image(document["path"], page, size=size)
@app.route("/book/frontpage/<int:docnum>", methods=["GET"])
def do_book_frontpage(docnum):
with index_metadata.reader() as reader:
document = reader.stored_fields(docnum)
- return pdf_to_image(document['path'], 0, 260)
+ return pdf_to_image(document["path"], 0, 260)
@app.route("/page/file/<int:docnum>", methods=["GET"])
def do_page_file(docnum):
with index_book.reader() as reader:
document = reader.stored_fields(docnum)
- filepath = document['path']
- page = document['pagenumber'] - 1
+ filepath = document["path"]
+ page = document["pagenumber"] - 1
app.logger.debug(str_format("Extract page={page} from filepath={filepath}", page=page, filepath=filepath))
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+ inputfile = pyPdf.PdfFileReader(file(filepath, "r"))
pdfpage = inputfile.getPage(page)
outbuf = StringIO()
outfile = pyPdf.PdfFileWriter()
@@ -93,7 +93,7 @@ def do_page_file(docnum):
r = werkzeug.Response(outbuf, mimetype="application/pdf")
client_filename = os.path.basename(filepath)[:-3]
client_filename += str_format(u".Page-{page}.pdf", page=page)
- r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
+ r.headers.add("Content-Disposition", "attachment", filename=client_filename)
return r
class MyHtmlFormatter(highlight.HtmlFormatter):
@@ -110,10 +110,10 @@ def do_excerpt(docnum, term):
q = q.simplify(reader)
terms = [ text for fieldname, text in q.all_terms()
if fieldname == "content" ]
- excerpt = highlight.highlight(document['content'],
+ excerpt = highlight.highlight(document["content"],
terms,
analysis.StandardAnalyzer(),
- highlight.SimpleFragmenter(),
+ highlight.ContextFragmenter(terms, surround=40),
MyHtmlFormatter())
return unicode( excerpt )
@@ -121,7 +121,7 @@ def do_excerpt(docnum, term):
@app.route("/search/", methods=["GET"])
def do_search(term=None):
if not term:
- return flask.render_template('search.html', objects=[], term="")
+ return flask.render_template("search.html", objects=[], term="")
term = term.lower()
searcher = index_book.searcher()
@@ -138,23 +138,23 @@ def do_search(term=None):
with index_metadata.searcher() as searcher:
docnum = searcher.document_number(path=filepath)
with index_metadata.reader() as reader2:
- title = reader2.stored_fields(docnum).get('title')
+ title = reader2.stored_fields(docnum).get("title")
books[docnum] = {
- 'matches' : [],
- 'title':title,
- 'filename' : os.path.basename(filepath),
+ "matches" : [],
+ "title":title,
+ "filename" : os.path.basename(filepath),
}
for match in book[1]:
- pagenumber = reader.stored_fields(match[0])['pagenumber']
+ pagenumber = reader.stored_fields(match[0])["pagenumber"]
match = (match[0], match[1], pagenumber)
- books[docnum]['matches'].append(match)
+ books[docnum]["matches"].append(match)
- return flask.render_template('search.html',
+ return flask.render_template("search.html",
books=books,
term=term)
def log_response(sender, response):
- sender.logger.debug('Request context is about to close down. '
- 'Response: %s', response)
+ sender.logger.debug("Request context is about to close down. "
+ "Response: %s", response)
if __name__ == "__main__":
app.run(host="0.0.0.0", port=8000)