diff options
-rw-r--r-- | indexer.py | 35 | ||||
-rw-r--r-- | templates/search.html | 31 | ||||
-rw-r--r-- | web.py | 53 |
3 files changed, 73 insertions, 46 deletions
@@ -18,9 +18,11 @@ schema = fields.Schema( createtime=fields.NUMERIC() ) if not os.path.exists("index"): + create = True os.mkdir("index") index = create_in(u"index", schema) else: + create = False index = open_dir("index") filepaths = Queue() @@ -36,7 +38,7 @@ for path, directories, files in os.walk(directory): for filename in files: if filename.endswith(".pdf"): filepath = os.path.join(path, filename) - docnum = searcher.document_number(path=filepath) + docnum = create or searcher.document_number(path=filepath) if not docnum: skipped += 1 else: @@ -45,19 +47,20 @@ for path, directories, files in os.walk(directory): print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped), print "" -writer = index.writer() -deleted = 0 -processed = 0 -for fields in searcher.all_stored_fields(): - path = fields['path'] - processed += 1 - if not os.path.exists(path): - writer.delete_by_term('path', path) - deleted += 1 - print u"\r{0} pages processed. {1} deleted".format(processed, deleted), -print "" +if not create: + writer = index.writer() + deleted = 0 + processed = 0 + for fields in searcher.all_stored_fields(): + path = fields['path'] + processed += 1 + if not os.path.exists(path): + writer.delete_by_term('path', path) + deleted += 1 + print u"\r{0} pages processed. {1} deleted".format(processed, deleted), + print "" + writer.commit() -writer.commit() searcher.close() class PDFWorker(Thread): @@ -73,6 +76,7 @@ class PDFWorker(Thread): title = inputfile.getDocumentInfo().title pagenumber = 0 for page in inputfile.pages: + print u"{0} processing {1} Page {2}".format(self.name, filepath, pagenumber) pagenumber += 1 content = page.extractText() documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } ) @@ -87,7 +91,7 @@ class IndexWorker(Thread): while index != None: try: doc = documents.get(True, 0.5) - except Empty: + except Empty,e: continue print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) writer = index.writer() @@ -107,10 +111,9 @@ print "all running" for thread in threads: thread.join() -idx.join() - oldindex = index index = None +idx.join() print "optimize index" oldindex.optimize() oldindex.close() diff --git a/templates/search.html b/templates/search.html index 5f68355..e2773ae 100644 --- a/templates/search.html +++ b/templates/search.html @@ -8,26 +8,27 @@ {% endblock %} {% block searchValue %}{{ term }}{% endblock %} {% block content %} - {% if objects.__len__() == 0 %} + {% if match_groups.__len__() == 0 %} No Matches {% else %} - {% for obj in objects %} + {{ resultlen }} Matches in {{ match_groups.__len__() }} files + {% for match_group in match_groups %} <hr /> <h2> - {{ obj['title'] }} - <a href="{{ url_for("do_download_page", docnum=obj['docnum']) }}"> - This Page - </a> - - - <a href="{{ url_for("do_download_file", docnum=obj['docnum']) }}"> - File - </a> + {{ match_group['title'] }} - {{ match_group['filename'] }} + (<a href="{{ url_for("do_download_file", docnum=match_group['first_docnum']) }}"> + Download + </a>) </h2> - <pre> {{ obj['path'] }} </pre> - {% autoescape false %} - <div>{{ obj['excerpt'] }}</div> - {% endautoescape %} - <hr /> + {% for match in match_group['matches'] %} + <h3> + Page {{ match['pagenumber'] }} + (<a href="{{ url_for("do_download_page", docnum=match_group['first_docnum'], page=match['pagenumber']) }}">Download</a>) + </h3> + {% autoescape false %} + <div>{{ match['excerpt'] }}</div> + {% endautoescape %} + {% endfor %} {% endfor %} {% endif %} @@ -28,22 +28,27 @@ def do_download_file(docnum): filename=os.path.basename(document['path'])) return r -@app.route("/download/page/<int:docnum>", methods=["GET"]) -def do_download_page(docnum): +@app.route("/download/page/<int:docnum>/<int:page>", methods=["GET"]) +def do_download_page(docnum,page): document = index.reader().stored_fields(docnum) inputfile = pyPdf.PdfFileReader(file(document['path'], 'r')) - page = inputfile.getPage(document['pagenumber']) + page = inputfile.getPage(page) outbuf = StringIO() outfile = pyPdf.PdfFileWriter() outfile.addPage(page) outfile.write(outbuf) outbuf.seek(0) - r= werkzeug.Response(outbuf, mimetype="application/pdf") + r = werkzeug.Response(outbuf, mimetype="application/pdf") client_filename = os.path.basename(document['path'])[:-3] client_filename += u".Page-{0}".format(document['pagenumber']) r.headers.add('Content-Disposition', 'attachment', filename=client_filename) return r +class MyHtmlFormatter(highlight.HtmlFormatter): + def _format_fragment(self, text, fragment, seen): + text = unicode( flask.Markup.escape(text) ) + return highlight.HtmlFormatter._format_fragment(self, text, fragment, seen) + @app.route("/search/skip=<int:skip>/<path:term>",methods=["GET"]) @app.route("/search/<path:term>",methods=["GET"]) @app.route("/search/", methods=["GET"]) @@ -52,23 +57,41 @@ def do_search(skip=0,term=None): return flask.render_template('search.html', objects=[], term="", skip=0) query = QueryParser("content").parse(term) - results = searcher.search(query, limit=skip+5) + results = searcher.search(query, limit=1001, sortedby="path") terms = [text for fieldname, text in query.all_terms() if fieldname == "content"] - objects = [] - for result in results[skip:skip+5]: + + matches = [] + for result in results: title = result.get("title") path = result.get("path") - print path - high = highlight.highlight(result.get("content"), - terms, - analysis.StandardAnalyzer(), - highlight.SimpleFragmenter(), - highlight.HtmlFormatter()) - objects.append({ 'title' : title, 'path' : path, 'excerpt' : high, 'docnum':result.docnum }) + excerpt = highlight.highlight(result.get("content"), + terms, + analysis.StandardAnalyzer(), + highlight.SimpleFragmenter(), + MyHtmlFormatter()) + matches.append( {'path':result.get('path'), + 'excerpt':excerpt, + 'docnum':result.docnum, + 'pagenumber':result.get('pagenumber'), + 'title':title }) + match_groups = {} + for match in matches: + print match + if not match_groups.has_key(match['path']): + match_groups[match['path']] = { 'matches':[], + 'title' : match['title'], + 'first_docnum' : match['docnum'], + 'filename':os.path.basename(match['path']) } + match_groups[ match['path'] ]['matches'].append( + { 'excerpt':match['excerpt'], + 'docnum':match['docnum'], + 'pagenumber':match['pagenumber'] }) - return flask.render_template('search.html', objects=objects, term=term, skip=skip) + objects = match_groups.values()[skip:skip+5] + return flask.render_template('search.html', + match_groups=objects, term=term, skip=skip, resultlen=len(results)) if __name__ == "__main__": app.debug = True |