3 files changed, 73 insertions, 46 deletions
diff --git a/indexer.py b/indexer.py
index 22f583e..ac14a9e 100644
--- a/indexer.py
+++ b/indexer.py
@@ -18,9 +18,11 @@ schema = fields.Schema(
     createtime=fields.NUMERIC() )
 
 if not os.path.exists("index"):
+    create = True
     os.mkdir("index")
     index = create_in(u"index", schema)
 else:
+    create = False
     index = open_dir("index")
 
 filepaths = Queue()
@@ -36,7 +38,7 @@ for path, directories, files in os.walk(directory):
     for filename in files:
         if filename.endswith(".pdf"):
             filepath = os.path.join(path, filename)
-            docnum = searcher.document_number(path=filepath)
+            docnum = create or searcher.document_number(path=filepath)
             if not docnum:
                 skipped += 1
             else:
@@ -45,19 +47,20 @@ for path, directories, files in os.walk(directory):
             print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
 print ""
 
-writer = index.writer()
-deleted = 0
-processed = 0
-for fields in searcher.all_stored_fields():
-    path = fields['path']
-    processed += 1
-    if not os.path.exists(path):
-        writer.delete_by_term('path', path)
-        deleted += 1
-    print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
-print ""
+if not create:
+    writer = index.writer()
+    deleted = 0
+    processed = 0
+    for fields in searcher.all_stored_fields():
+        path = fields['path']
+        processed += 1
+        if not os.path.exists(path):
+            writer.delete_by_term('path', path)
+            deleted += 1
+        print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
+    print ""
+    writer.commit()
 
-writer.commit()
 searcher.close()
 
 class PDFWorker(Thread):
@@ -73,6 +76,7 @@ class PDFWorker(Thread):
                 title = inputfile.getDocumentInfo().title
                 pagenumber = 0
                 for page in inputfile.pages:
+                    print u"{0} processing {1} Page {2}".format(self.name, filepath, pagenumber)
                     pagenumber += 1
                     content = page.extractText()
                     documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
@@ -87,7 +91,7 @@ class IndexWorker(Thread):
         while index != None:
             try:
                 doc = documents.get(True, 0.5)
-            except Empty:
+            except Empty,e:
                 continue
             print u"{0} adding     {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
             writer = index.writer()
@@ -107,10 +111,9 @@ print "all running"
 for thread in threads:
     thread.join()
 
-idx.join()
-
 oldindex = index
 index = None
+idx.join()
 print "optimize index"
 oldindex.optimize()
 oldindex.close()
diff --git a/templates/search.html b/templates/search.html
index 5f68355..e2773ae 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -8,26 +8,27 @@
 {% endblock %}
 {% block searchValue %}{{ term }}{% endblock %}
 {% block content %}
-        {% if objects.__len__() == 0 %}
+        {% if match_groups.__len__() == 0 %}
             No Matches
         {% else %}
-            {% for obj in objects %}
+            {{ resultlen }} Matches in {{ match_groups.__len__() }} files
+            {% for match_group in match_groups %}
                 <hr />
                 <h2>
-                        {{ obj['title'] }}
-                    <a href="{{ url_for("do_download_page", docnum=obj['docnum']) }}">
-                        This Page
-                    </a>
-                    -
-                    <a href="{{ url_for("do_download_file", docnum=obj['docnum']) }}">
-                        File
-                    </a>
+                    {{ match_group['title'] }} - {{ match_group['filename'] }}
+                    (<a href="{{ url_for("do_download_file", docnum=match_group['first_docnum']) }}">
+                        Download
+                    </a>)
                 </h2>
-                <pre> {{ obj['path'] }} </pre>
-                {% autoescape false %}
-                    <div>{{ obj['excerpt'] }}</div>
-                {% endautoescape %}
-                <hr />
+                {% for match in match_group['matches'] %}
+                    <h3>
+                        Page {{ match['pagenumber'] }}
+                        (<a href="{{ url_for("do_download_page", docnum=match_group['first_docnum'], page=match['pagenumber']) }}">Download</a>)
+                    </h3>
+                    {% autoescape false %}
+                        <div>{{ match['excerpt'] }}</div>
+                    {% endautoescape %}
+                {% endfor %}
             {% endfor %}
         {% endif %}
 
diff --git a/web.py b/web.py
index b262e1d..b5494c6 100644
--- a/web.py
+++ b/web.py
@@ -28,22 +28,27 @@ def do_download_file(docnum):
         filename=os.path.basename(document['path']))
     return r
 
-@app.route("/download/page/<int:docnum>", methods=["GET"])
-def do_download_page(docnum):
+@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"])
+def do_download_page(docnum,page):
     document = index.reader().stored_fields(docnum)
     inputfile = pyPdf.PdfFileReader(file(document['path'], 'r'))
-    page = inputfile.getPage(document['pagenumber'])
+    page = inputfile.getPage(page)
     outbuf = StringIO()
     outfile = pyPdf.PdfFileWriter()
     outfile.addPage(page)
     outfile.write(outbuf)
     outbuf.seek(0)
-    r= werkzeug.Response(outbuf, mimetype="application/pdf")
+    r = werkzeug.Response(outbuf, mimetype="application/pdf")
     client_filename = os.path.basename(document['path'])[:-3]
     client_filename += u".Page-{0}".format(document['pagenumber'])
     r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
     return r
 
+class MyHtmlFormatter(highlight.HtmlFormatter):
+    def _format_fragment(self, text, fragment, seen):
+        text = unicode( flask.Markup.escape(text) )
+        return highlight.HtmlFormatter._format_fragment(self, text, fragment, seen)
+
 @app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
 @app.route("/search/<path:term>",methods=["GET"])
 @app.route("/search/", methods=["GET"])
@@ -52,23 +57,41 @@ def do_search(skip=0,term=None):
         return flask.render_template('search.html', objects=[], term="", skip=0)
 
     query = QueryParser("content").parse(term)
-    results = searcher.search(query, limit=skip+5)
+    results = searcher.search(query, limit=1001, sortedby="path")
 
     terms = [text for fieldname, text in query.all_terms()
                     if fieldname == "content"]
-    objects = []
-    for result in results[skip:skip+5]:
+
+    matches = []
+    for result in results:
         title = result.get("title")
         path = result.get("path")
-        print path
-        high = highlight.highlight(result.get("content"), 
-            terms, 
-            analysis.StandardAnalyzer(),
-            highlight.SimpleFragmenter(),
-            highlight.HtmlFormatter())
-        objects.append({ 'title' : title, 'path' : path, 'excerpt' : high, 'docnum':result.docnum })
+        excerpt = highlight.highlight(result.get("content"), 
+                    terms, 
+                    analysis.StandardAnalyzer(),
+                    highlight.SimpleFragmenter(),
+                    MyHtmlFormatter())
+        matches.append( {'path':result.get('path'),
+                         'excerpt':excerpt, 
+                         'docnum':result.docnum, 
+                         'pagenumber':result.get('pagenumber'), 
+                         'title':title })
+    match_groups = {}
+    for match in matches:
+        print match
+        if not match_groups.has_key(match['path']):
+            match_groups[match['path']] = { 'matches':[],
+                                            'title' : match['title'],
+                                            'first_docnum' : match['docnum'],
+                                            'filename':os.path.basename(match['path']) }
+        match_groups[ match['path'] ]['matches'].append(
+            { 'excerpt':match['excerpt'],
+              'docnum':match['docnum'],
+              'pagenumber':match['pagenumber'] })
 
-    return flask.render_template('search.html', objects=objects, term=term, skip=skip)
+    objects = match_groups.values()[skip:skip+5]
+    return flask.render_template('search.html', 
+        match_groups=objects, term=term, skip=skip, resultlen=len(results))
 
 if __name__ == "__main__":
     app.debug = True