page based index; download page; download file

author: yvesf <yvesf-git@xapek.org> 2010-11-20 01:58:53 +0100
committer: yvesf <yvesf-git@xapek.org> 2010-11-20 01:58:53 +0100
commit: 966a17b12c9deab35ef0a804d9fa1faea9c8042d (patch)
tree: 7ba80df4fcfaaabded6e82a48d139e48e4978952
parent: 0ae1e5e802871903d73d6542252aa0a8d08fba39 (diff)
download: booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.tar.gz
booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.zip
3 files changed, 65 insertions, 14 deletions
diff --git a/index.py b/index.py
index 3758233..5a982a9 100644
--- a/index.py
+++ b/index.py
@@ -13,6 +13,7 @@ from threading import Thread, Condition
 schema = fields.Schema(
     title=fields.TEXT(stored=True),
     path=fields.ID(stored=True),
+    pagenumber=fields.NUMERIC(stored=True),
     content=fields.TEXT(stored=True),
     createtime=fields.NUMERIC() )
 
@@ -48,10 +49,11 @@ class PDFWorker(Thread):
                 print u"{0} processing {1}".format(self.name, filepath)
                 inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
                 title = inputfile.getDocumentInfo().title
-                content = u""
+                pagenumber = 0
                 for page in inputfile.pages:
-                    content += page.extractText()
-                documents.put( {"title":title, "path":filepath, "content":content, "createtime":time.time() } )
+                    pagenumber += 1
+                    content = page.extractText()
+                    documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
             except Exception, e:
                 print u"{0} Exception: {1}".format(self.name, str(e))
             finally:
@@ -65,12 +67,12 @@ class IndexWorker(Thread):
                 doc = documents.get(True, 0.5)
             except Empty:
                 continue
-            print u"{0} adding     {1}".format(self.name, doc['path'])
+            print u"{0} adding     {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
             writer = index.writer()
             writer.add_document(**doc)
             writer.commit()
             documents.task_done()
-            print u"{0} added      {1}".format(self.name, doc['path'])
+            print u"{0} added      {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
 
 threads = map(lambda i: PDFWorker(), range(1))
 for thread in threads:
diff --git a/templates/search.html b/templates/search.html
index e6bfe07..0c5f3d6 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -1,14 +1,33 @@
 <html>
     <head>
         <title>{{ objects.__len__() + skip}} matches</title>
+<script>
+function start() {
+var inputField = document.getElementById("search");
+window.location = "/search/" + inputField.value;
+}
+</script>
     </head>
     <body>
+<div>
+<input id="search" type="text"/>
+<button onclick="start()">Go</button>
+</div>
         {% if objects.__len__() == 0 %}
             No Matches
         {% else %}
             {% for obj in objects %}
                 <hr />
-                <h2> {{ obj['title'] }} </h2>
+                <h2>
+                        {{ obj['title'] }}
+                    <a href="{{ url_for("do_download_page", docnum=obj['docnum']) }}">
+                        This Page
+                    </a>
+                    -
+                    <a href="{{ url_for("do_download_file", docnum=obj['docnum']) }}">
+                        File
+                    </a>
+                </h2>
                 <pre> {{ obj['path'] }} </pre>
                 {% autoescape false %}
                     <div>{{ obj['excerpt'] }}</div>
diff --git a/web.py b/web.py
index 429c516..b747010 100644
--- a/web.py
+++ b/web.py
@@ -7,16 +7,45 @@ import whoosh.analysis as analysis
 from whoosh import highlight
 import flask
 from flask import Flask
-
+import pyPdf
+from StringIO import StringIO
+import werkzeug
 app = Flask("booksearch")
 
 index = open_dir(u"index", mapped=False)
 searcher = index.searcher()
 
+@app.route("/")
+def do_index():  # Root URL: redirect to the URL Flask builds for do_search with an empty term.
+    return flask.redirect(flask.url_for("do_search",term=""))
+
+@app.route("/download/file/<int:docnum>")
+def do_download_file(docnum):
+   document = index.reader().stored_fields(docnum)
+   filepath = document['path']
+   return werkzeug.Response(open(filepath, "r"), mimetype="application/pdf")
+
+@app.route("/download/page/<int:docnum>", methods=["GET"])
+def do_download_page(docnum):
+   document = index.reader().stored_fields(docnum)
+   filepath = document['path']
+   pagenumber = document['pagenumber']
+   inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+   page = inputfile.getPage(pagenumber)
+   outbuf = StringIO()
+   outfile = pyPdf.PdfFileWriter()
+   outfile.addPage(page)
+   outfile.write(outbuf)
+   outbuf.seek(0)
+   return werkzeug.Response(outbuf, mimetype="application/pdf")
 
 @app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
 @app.route("/search/<path:term>",methods=["GET"])
+@app.route("/search/", methods=["GET"])
 def do_search(skip=0,term=None):
+    if skip == 0 and not term:
+        return flask.render_template('search.html', objects=[], term="", skip=0)
+
     query = QueryParser("content").parse(term)
     results = searcher.search(query, limit=skip+5)
 
@@ -26,14 +55,15 @@ def do_search(skip=0,term=None):
     for result in results[skip:skip+5]:
         title = result.get("title")
         path = result.get("path")
-#        high = highlight.highlight(result.get("content"), 
-#            terms, 
-#            analysis.StandardAnalyzer(),
-#            highlight.SimpleFragmenter(),
-#            highlight.HtmlFormatter())
-        objects.append({ 'title' : title, 'path' : path, 'excerpt' : 'TODO' })
-    return flask.render_template('search.html', objects=objects, term=term, skip=skip)
+        print path
+        high = highlight.highlight(result.get("content"), 
+            terms, 
+            analysis.StandardAnalyzer(),
+            highlight.SimpleFragmenter(),
+            highlight.HtmlFormatter())
+        objects.append({ 'title' : title, 'path' : path, 'excerpt' : high, 'docnum':result.docnum })
 
+    return flask.render_template('search.html', objects=objects, term=term, skip=skip)
 
 if __name__ == "__main__":
     app.debug = True
author	yvesf <yvesf-git@xapek.org>	2010-11-20 01:58:53 +0100
committer	yvesf <yvesf-git@xapek.org>	2010-11-20 01:58:53 +0100
commit	966a17b12c9deab35ef0a804d9fa1faea9c8042d (patch)
tree	7ba80df4fcfaaabded6e82a48d139e48e4978952
parent	0ae1e5e802871903d73d6542252aa0a8d08fba39 (diff)
download	booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.tar.gz booksearch-966a17b12c9deab35ef0a804d9fa1faea9c8042d.zip