summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author yvesf <yvesf-git@xapek.org> 2010-11-20 01:58:53 +0100
committer yvesf <yvesf-git@xapek.org> 2010-11-20 01:58:53 +0100
commit 966a17b12c9deab35ef0a804d9fa1faea9c8042d (patch)
tree 7ba80df4fcfaaabded6e82a48d139e48e4978952
parent 0ae1e5e802871903d73d6542252aa0a8d08fba39 (diff)
download booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.tar.gz
download booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.zip
page based index; download page; download file
-rw-r--r--  index.py               12
-rw-r--r--  templates/search.html  21
-rw-r--r--  web.py                 46
3 files changed, 65 insertions, 14 deletions
diff --git a/index.py b/index.py
index 3758233..5a982a9 100644
--- a/index.py
+++ b/index.py
@@ -13,6 +13,7 @@ from threading import Thread, Condition
schema = fields.Schema(
title=fields.TEXT(stored=True),
path=fields.ID(stored=True),
+ pagenumber=fields.NUMERIC(stored=True),
content=fields.TEXT(stored=True),
createtime=fields.NUMERIC() )
@@ -48,10 +49,11 @@ class PDFWorker(Thread):
print u"{0} processing {1}".format(self.name, filepath)
inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
title = inputfile.getDocumentInfo().title
- content = u""
+ pagenumber = 0
for page in inputfile.pages:
- content += page.extractText()
- documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } )
+ pagenumber += 1
+ content = page.extractText()
+ documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
except Exception, e:
print u"{0} Exception: {1}".format(self.name, str(e))
finally:
@@ -65,12 +67,12 @@ class IndexWorker(Thread):
doc = documents.get(True, 0.5)
except Empty:
continue
- print u"{0} adding {1}".format(self.name, doc['path'])
+ print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
writer = index.writer()
writer.add_document(**doc)
writer.commit()
documents.task_done()
- print u"{0} added {1}".format(self.name, doc['path'])
+ print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
threads = map(lambda i: PDFWorker(), range(1))
for thread in threads:
diff --git a/templates/search.html b/templates/search.html
index e6bfe07..0c5f3d6 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -1,14 +1,33 @@
<html>
<head>
<title>{{ objects.__len__() + skip}} matches</title>
+<script>
+function start() {
+var inputField = document.getElementById("search");
+window.location = "/search/" + inputField.value;
+}
+</script>
</head>
<body>
+<div>
+<input id="search" type="text"/>
+<button onclick="start()">Go</button>
+</div>
{% if objects.__len__() == 0 %}
No Matches
{% else %}
{% for obj in objects %}
<hr />
- <h2> {{ obj['title'] }} </h2>
+ <h2>
+ {{ obj['title'] }}
+ <a href="{{ url_for("do_download_page", docnum=obj['docnum']) }}">
+ This Page
+ </a>
+ -
+ <a href="{{ url_for("do_download_file", docnum=obj['docnum']) }}">
+ File
+ </a>
+ </h2>
<pre> {{ obj['path'] }} </pre>
{% autoescape false %}
<div>{{ obj['excerpt'] }}</div>
diff --git a/web.py b/web.py
index 429c516..b747010 100644
--- a/web.py
+++ b/web.py
@@ -7,16 +7,45 @@ import whoosh.analysis as analysis
from whoosh import highlight
import flask
from flask import Flask
-
+import pyPdf
+from StringIO import StringIO
+import werkzeug
app = Flask("booksearch")
index = open_dir(u"index", mapped=False)
searcher = index.searcher()
+@app.route("/")
+def do_index():
+ return flask.redirect(flask.url_for("do_search",term=""))
+
+@app.route("/download/file/<int:docnum>")
+def do_download_file(docnum):
+ document = index.reader().stored_fields(docnum)
+ filepath = document['path']
+ return werkzeug.Response(open(filepath, "r"), mimetype="application/pdf")
+
+@app.route("/download/page/<int:docnum>", methods=["GET"])
+def do_download_page(docnum):
+ document = index.reader().stored_fields(docnum)
+ filepath = document['path']
+ pagenumber = document['pagenumber']
+ inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+ page = inputfile.getPage(pagenumber)
+ outbuf = StringIO()
+ outfile = pyPdf.PdfFileWriter()
+ outfile.addPage(page)
+ outfile.write(outbuf)
+ outbuf.seek(0)
+ return werkzeug.Response(outbuf, mimetype="application/pdf")
@app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
@app.route("/search/<path:term>",methods=["GET"])
+@app.route("/search/", methods=["GET"])
def do_search(skip=0,term=None):
+ if skip == 0 and not term:
+ return flask.render_template('search.html', objects=[], term="", skip=0)
+
query = QueryParser("content").parse(term)
results = searcher.search(query, limit=skip+5)
@@ -26,14 +55,15 @@ def do_search(skip=0,term=None):
for result in results[skip:skip+5]:
title = result.get("title")
path = result.get("path")
-# high = highlight.highlight(result.get("content"),
-# terms,
-# analysis.StandardAnalyzer(),
-# highlight.SimpleFragmenter(),
-# highlight.HtmlFormatter())
- objects.append({ 'title' : title, 'path' : path, 'excerpt' : 'TODO' })
- return flask.render_template('search.html', objects=objects, term=term, skip=skip)
+ print path
+ high = highlight.highlight(result.get("content"),
+ terms,
+ analysis.StandardAnalyzer(),
+ highlight.SimpleFragmenter(),
+ highlight.HtmlFormatter())
+ objects.append({ 'title' : title, 'path' : path, 'excerpt' : high, 'docnum':result.docnum })
+ return flask.render_template('search.html', objects=objects, term=term, skip=skip)
if __name__ == "__main__":
app.debug = True