-rw-r--r--  indexer.py             | 35
-rw-r--r--  templates/search.html  | 31
-rw-r--r--  web.py                 | 53
3 files changed, 73 insertions(+), 46 deletions(-)
diff --git a/indexer.py b/indexer.py
index 22f583e..ac14a9e 100644
--- a/indexer.py
+++ b/indexer.py
@@ -18,9 +18,11 @@ schema = fields.Schema(
    createtime=fields.NUMERIC() )

if not os.path.exists("index"):
+    create = True
    os.mkdir("index")
    index = create_in(u"index", schema)
else:
+    create = False
    index = open_dir("index")

filepaths = Queue()
@@ -36,7 +38,7 @@ for path, directories, files in os.walk(directory):
    for filename in files:
        if filename.endswith(".pdf"):
            filepath = os.path.join(path, filename)
-            docnum = searcher.document_number(path=filepath)
+            docnum = create or searcher.document_number(path=filepath)
            if not docnum:
                skipped += 1
            else:
@@ -45,19 +47,20 @@ for path, directories, files in os.walk(directory):
print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
print ""
-writer = index.writer()
-deleted = 0
-processed = 0
-for fields in searcher.all_stored_fields():
- path = fields['path']
- processed += 1
- if not os.path.exists(path):
- writer.delete_by_term('path', path)
- deleted += 1
- print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
-print ""
+if not create:
+ writer = index.writer()
+ deleted = 0
+ processed = 0
+ for fields in searcher.all_stored_fields():
+ path = fields['path']
+ processed += 1
+ if not os.path.exists(path):
+ writer.delete_by_term('path', path)
+ deleted += 1
+ print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
+ print ""
+ writer.commit()
-writer.commit()
searcher.close()
class PDFWorker(Thread):
@@ -73,6 +76,7 @@ class PDFWorker(Thread):
            title = inputfile.getDocumentInfo().title
            pagenumber = 0
            for page in inputfile.pages:
+                print u"{0} processing {1} Page {2}".format(self.name, filepath, pagenumber)
                pagenumber += 1
                content = page.extractText()
                documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
@@ -87,7 +91,7 @@ class IndexWorker(Thread):
        while index != None:
            try:
                doc = documents.get(True, 0.5)
-            except Empty:
+            except Empty,e:
                continue
            print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
            writer = index.writer()
@@ -107,10 +111,9 @@ print "all running"
for thread in threads:
    thread.join()
-idx.join()
-
oldindex = index
index = None
+idx.join()
print "optimize index"
oldindex.optimize()
oldindex.close()
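Note on the indexer change above: the new create flag skips the stale-page cleanup pass when the index directory is being built for the first time, and idx.join() now runs only after index is set to None, because the IndexWorker loop uses index != None as its shutdown test. A minimal sketch of that shutdown pattern under Python 2, with hypothetical names (running, Worker, documents) standing in for the module-level variables in indexer.py:

# Minimal sketch of the queue-polling shutdown used by IndexWorker
# (hypothetical names; `running` plays the role of the module-level `index`).
from Queue import Queue, Empty
from threading import Thread
import time

documents = Queue()
running = True

class Worker(Thread):
    def run(self):
        while running:                          # same idea as `while index != None`
            try:
                doc = documents.get(True, 0.5)  # block for at most half a second
            except Empty:
                continue                        # nothing queued; re-check the flag
            print u"indexing {0}".format(doc)

w = Worker()
w.start()
documents.put(u"example.pdf, page 1")
time.sleep(1)       # give the worker a moment to drain the queue
running = False     # clear the flag first...
w.join()            # ...then join, mirroring the reordered idx.join()

Clearing the sentinel before join() is what lets the worker's polling loop fall through instead of blocking forever on an empty queue.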
diff --git a/templates/search.html b/templates/search.html
index 5f68355..e2773ae 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -8,26 +8,27 @@
{% endblock %}
{% block searchValue %}{{ term }}{% endblock %}
{% block content %}
-    {% if objects.__len__() == 0 %}
+    {% if match_groups.__len__() == 0 %}
        No Matches
    {% else %}
-        {% for obj in objects %}
+        {{ resultlen }} Matches in {{ match_groups.__len__() }} files
+        {% for match_group in match_groups %}
            <hr />
            <h2>
-                {{ obj['title'] }}
-                <a href="{{ url_for("do_download_page", docnum=obj['docnum']) }}">
-                    This Page
-                </a>
-                -
-                <a href="{{ url_for("do_download_file", docnum=obj['docnum']) }}">
-                    File
-                </a>
+                {{ match_group['title'] }} - {{ match_group['filename'] }}
+                (<a href="{{ url_for("do_download_file", docnum=match_group['first_docnum']) }}">
+                    Download
+                </a>)
            </h2>
-            <pre> {{ obj['path'] }} </pre>
-            {% autoescape false %}
-                <div>{{ obj['excerpt'] }}</div>
-            {% endautoescape %}
-            <hr />
+            {% for match in match_group['matches'] %}
+                <h3>
+                    Page {{ match['pagenumber'] }}
+                    (<a href="{{ url_for("do_download_page", docnum=match_group['first_docnum'], page=match['pagenumber']) }}">Download</a>)
+                </h3>
+                {% autoescape false %}
+                    <div>{{ match['excerpt'] }}</div>
+                {% endautoescape %}
+            {% endfor %}
        {% endfor %}
    {% endif %}
{% endblock %}
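The reworked template above iterates over grouped results instead of a flat object list: each entry in match_groups carries the file-level fields (title, filename, first_docnum) plus a matches list of per-page dicts with pagenumber and excerpt, and resultlen reports the total hit count. A minimal sketch of rendering that shape with a stand-alone Jinja2 template (the data and the inline template string are made up; the url_for links are omitted because they need a Flask request context):

# Sketch of the context shape the new search.html expects
# (hypothetical data; the inline template is a stand-in, not the real file).
from jinja2 import Template

match_groups = [
    { 'title': u'Example PDF',
      'filename': u'example.pdf',
      'first_docnum': 42,
      'matches': [ { 'pagenumber': 3, 'excerpt': u'... <b>term</b> ...' },
                   { 'pagenumber': 7, 'excerpt': u'... another <b>term</b> ...' } ] },
]

t = Template(u"{{ resultlen }} Matches in {{ match_groups.__len__() }} files\n"
             u"{% for match_group in match_groups %}"
             u"{{ match_group['filename'] }}:"
             u"{% for match in match_group['matches'] %}"
             u" page {{ match['pagenumber'] }}"
             u"{% endfor %}\n"
             u"{% endfor %}")
print t.render(match_groups=match_groups, resultlen=2)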
diff --git a/web.py b/web.py
index b262e1d..b5494c6 100644
--- a/web.py
+++ b/web.py
@@ -28,22 +28,27 @@ def do_download_file(docnum):
                          filename=os.path.basename(document['path']))
    return r

-@app.route("/download/page/<int:docnum>", methods=["GET"])
-def do_download_page(docnum):
+@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"])
+def do_download_page(docnum,page):
    document = index.reader().stored_fields(docnum)
    inputfile = pyPdf.PdfFileReader(file(document['path'], 'r'))
-    page = inputfile.getPage(document['pagenumber'])
+    page = inputfile.getPage(page)
    outbuf = StringIO()
    outfile = pyPdf.PdfFileWriter()
    outfile.addPage(page)
    outfile.write(outbuf)
    outbuf.seek(0)
-    r= werkzeug.Response(outbuf, mimetype="application/pdf")
+    r = werkzeug.Response(outbuf, mimetype="application/pdf")
    client_filename = os.path.basename(document['path'])[:-3]
    client_filename += u".Page-{0}".format(document['pagenumber'])
    r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
    return r

+class MyHtmlFormatter(highlight.HtmlFormatter):
+    def _format_fragment(self, text, fragment, seen):
+        text = unicode( flask.Markup.escape(text) )
+        return highlight.HtmlFormatter._format_fragment(self, text, fragment, seen)
+
@app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"])
@app.route("/search/<path:term>",methods=["GET"])
@app.route("/search/", methods=["GET"])
@@ -52,23 +57,41 @@ def do_search(skip=0,term=None):
        return flask.render_template('search.html', objects=[], term="", skip=0)

    query = QueryParser("content").parse(term)
-    results = searcher.search(query, limit=skip+5)
+    results = searcher.search(query, limit=1001, sortedby="path")
    terms = [text for fieldname, text in query.all_terms()
             if fieldname == "content"]
-    objects = []
-    for result in results[skip:skip+5]:
+
+    matches = []
+    for result in results:
        title = result.get("title")
        path = result.get("path")
-        print path
-        high = highlight.highlight(result.get("content"),
-                                   terms,
-                                   analysis.StandardAnalyzer(),
-                                   highlight.SimpleFragmenter(),
-                                   highlight.HtmlFormatter())
-        objects.append({ 'title' : title, 'path' : path, 'excerpt' : high, 'docnum':result.docnum })
+        excerpt = highlight.highlight(result.get("content"),
+                                      terms,
+                                      analysis.StandardAnalyzer(),
+                                      highlight.SimpleFragmenter(),
+                                      MyHtmlFormatter())
+        matches.append( {'path':result.get('path'),
+                         'excerpt':excerpt,
+                         'docnum':result.docnum,
+                         'pagenumber':result.get('pagenumber'),
+                         'title':title })
+    match_groups = {}
+    for match in matches:
+        print match
+        if not match_groups.has_key(match['path']):
+            match_groups[match['path']] = { 'matches':[],
+                                            'title' : match['title'],
+                                            'first_docnum' : match['docnum'],
+                                            'filename':os.path.basename(match['path']) }
+        match_groups[ match['path'] ]['matches'].append(
+            { 'excerpt':match['excerpt'],
+              'docnum':match['docnum'],
+              'pagenumber':match['pagenumber'] })

-    return flask.render_template('search.html', objects=objects, term=term, skip=skip)
+    objects = match_groups.values()[skip:skip+5]
+    return flask.render_template('search.html',
+                                 match_groups=objects, term=term, skip=skip, resultlen=len(results))

if __name__ == "__main__":
    app.debug = True
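For reference, the new grouping step in do_search collapses the flat per-page hit list into one entry per file, keyed by path; that is what feeds match_groups in the template. A stand-alone sketch of just that step, with made-up sample data and the same field names as the diff (using `not in` instead of has_key, which is equivalent here):

# Stand-alone sketch of the per-file grouping done in do_search
# (sample data is hypothetical; keys match the diff above).
import os

matches = [
    {'path': u'/pdfs/a.pdf', 'title': u'A', 'docnum': 1, 'pagenumber': 1, 'excerpt': u'...'},
    {'path': u'/pdfs/a.pdf', 'title': u'A', 'docnum': 2, 'pagenumber': 2, 'excerpt': u'...'},
    {'path': u'/pdfs/b.pdf', 'title': u'B', 'docnum': 7, 'pagenumber': 5, 'excerpt': u'...'},
]

match_groups = {}
for match in matches:
    if match['path'] not in match_groups:           # first hit for this file
        match_groups[match['path']] = { 'matches': [],
                                        'title': match['title'],
                                        'first_docnum': match['docnum'],
                                        'filename': os.path.basename(match['path']) }
    match_groups[match['path']]['matches'].append({ 'excerpt': match['excerpt'],
                                                    'docnum': match['docnum'],
                                                    'pagenumber': match['pagenumber'] })

for group in match_groups.values()[:5]:             # the view paginates this slice
    print u"{0}: {1} matching page(s)".format(group['filename'], len(group['matches']))

Each group keeps first_docnum so the template can link the whole-file and per-page downloads through the do_download_file and do_download_page routes above.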