-rw-r--r--  INSTALL    |  3
-rw-r--r--  indexer.py | 97
-rw-r--r--  web.py     | 46

3 files changed, 111 insertions, 35 deletions
diff --git a/INSTALL b/INSTALL
@@ -3,8 +3,6 @@ TODO
 clean html
 consistent schema (_book _metadata) ?
 /usr/bin/convert ?
-check pdfminer for better text-extraction (whitespace)
-
 ------------------------
@@ -21,6 +19,7 @@ Install dependencies
     pip install whoosh
     pip install pypdf
     pip install flask
+    pip install pdfminer
 
 ( Clone )
     git clone http://xapek.org/~yvesf/repos/booksearch.git
diff --git a/indexer.py b/indexer.py
@@ -3,11 +3,90 @@
 import os
 import sys
 import time
-import pyPdf
 import whoosh.index as index
 import whoosh.writing as writing
 import whoosh.fields as fields
 from compat import str_format
+import StringIO
+
+
+def pdf_extract_metadata(filepath):
+    from pdfminer.pdfparser import PDFParser, PDFDocument
+    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+    from pdfminer.converter import TextConverter
+    from pdfminer.layout import LAParams
+    from lxml import etree
+
+    outbuf = StringIO.StringIO()
+    rsrcmgr = PDFResourceManager()
+    device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())
+    parser = PDFParser(file(filepath, "rb"))
+    doc = PDFDocument()
+
+    parser.set_document(doc)
+    doc.set_parser(parser)
+    doc.initialize("")
+
+    namespaces={
+        "dc":"http://purl.org/dc/elements/1.1/",
+        "rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+        "pdf":"http://ns.adobe.com/pdf/1.3/", }
+
+    if doc.catalog.has_key("Metadata"):
+        obj_ref = doc.catalog["Metadata"]
+        obj_stream = obj_ref.resolve()
+        if obj_stream.attrs['Subtype'].name == "XML":
+            obj_data = obj_stream.get_data()
+            if obj_data.endswith("\nf"):
+                obj_data = obj_data[:-len("\nf")]
+            print obj_data
+            tree = etree.parse(StringIO.StringIO(obj_data))
+            print tree.xpath("//dc:title/rdf:Alt/rdf:li", namespaces=namespaces)[0].text
+            return obj_data
+    else:
+        return None
+
+def pdf_extract_text_pdfminer(filepath):
+    from pdfminer.pdfparser import PDFParser, PDFDocument
+    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+    from pdfminer.converter import TextConverter
+    from pdfminer.layout import LAParams
+
+    outbuf = StringIO.StringIO()
+    rsrcmgr = PDFResourceManager()
+    device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())
+    parser = PDFParser(file(filepath, "rb"))
+    doc = PDFDocument()
+
+    parser.set_document(doc)
+    doc.set_parser(parser)
+    doc.initialize("")
+
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    for pagenumber, page in enumerate(doc.get_pages()):
+        interpreter.process_page(page)
+        outbuf.seek(0)
+        content = unicode(outbuf.read(), "utf-8", errors="replace")
+        yield (pagenumber+1, content) #start pages at 1
+        outbuf.seek(0)
+
+def pdf_extract_text_pypdf(filepath):
+    import pyPdf
+    inputfile = pyPdf.PdfFileReader(file(filepath, "r"))
+
+    pagenumber = 1
+    for page in inputfile.pages:
+        content = page.extractText()
+        yield (pagenumber, content)
+        pagenumber += 1
+
+""" Yields (pagenumber, text) """
+def pdf_extract_text(filepath):
+    try:
+        return pdf_extract_text_pdfminer(filepath)
+    except ImportError:
+        print "Fallback to pypdf"
+        return pdf_extract_text_pypdf(filepath)
 
 schema_book = fields.Schema(
         pagenumber=fields.NUMERIC(stored=True),
@@ -56,11 +135,11 @@ if not create_index: #update index for deleted files
     deleted = 0
     processed = 0
     for fields in searcher_metadata.all_stored_fields():
-        path = fields['path']
+        path = fields["path"]
         processed += 1
        if not os.path.exists(path):
-            writer_book.delete_by_term(u'path', path)
-            writer_metadata.delete_by_term('path', path)
+            writer_book.delete_by_term(u"path", path)
+            writer_metadata.delete_by_term("path", path)
             deleted += 1
         print str_format(u"\r{proc} pages processed. {deleted} deleted", proc=processed, deleted=deleted),
{deleted} deleted", proc=processed, deleted=deleted), print "" @@ -73,8 +152,7 @@ searcher_metadata.close() def process_file(filepath): try: print str_format(u"{pid} processing {filepath}", pid=os.getpid(), filepath=filepath) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) - title = inputfile.getDocumentInfo().title + title = u"notimplemented" writer_metadata = index_metadata.writer() writer_metadata.add_document(title=title, path=filepath, createtime=time.time()) @@ -82,9 +160,8 @@ def process_file(filepath): writer_book = writing.BatchWriter(index_book, limit=1000) pagenumber = 1 - for page in inputfile.pages: + for pagenumber, content in pdf_extract_text(filepath): print str_format(u"{pid} processing {filepath} Page {page}", pid=os.getpid(), filepath=filepath, page=pagenumber) - content = page.extractText() writer_book.add_document(pagenumber=pagenumber, path=filepath, content=content) @@ -92,9 +169,9 @@ def process_file(filepath): writer_book.commit() except KeyboardInterrupt: - return 'KeyboardInterrupt' + return "KeyboardInterrupt" except Exception,e: - print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=e) + print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=str(e)) try: import multiprocessing as mp @@ -33,9 +33,9 @@ def do_index(): def do_book_file(docnum): with index_metadata.reader() as reader: document = reader.stored_fields(docnum) - r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",) - r.headers.add('Content-Disposition', 'attachment', - filename=os.path.basename(document['path'])) + r = werkzeug.Response(open(document["path"], "r"), mimetype="application/pdf",) + r.headers.add("Content-Disposition", "attachment", + filename=os.path.basename(document["path"])) return r @@ -53,7 +53,7 @@ def pdf_to_image(filepath, page, size): if stdout: yield stdout pdffile = StringIO() - page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page) + page = pyPdf.PdfFileReader(file(filepath, "r")).getPage(page) out = pyPdf.PdfFileWriter() out.addPage(page) out.write(pdffile) @@ -66,24 +66,24 @@ def pdf_to_image(filepath, page, size): def do_page_image(docnum,size=260): with index_book.reader() as reader: document = reader.stored_fields(docnum) - page = document['pagenumber'] - 1 - return pdf_to_image(document['path'], page, size=size) + page = document["pagenumber"] - 1 + return pdf_to_image(document["path"], page, size=size) @app.route("/book/frontpage/<int:docnum>", methods=["GET"]) def do_book_frontpage(docnum): with index_metadata.reader() as reader: document = reader.stored_fields(docnum) - return pdf_to_image(document['path'], 0, 260) + return pdf_to_image(document["path"], 0, 260) @app.route("/page/file/<int:docnum>", methods=["GET"]) def do_page_file(docnum): with index_book.reader() as reader: document = reader.stored_fields(docnum) - filepath = document['path'] - page = document['pagenumber'] - 1 + filepath = document["path"] + page = document["pagenumber"] - 1 app.logger.debug(str_format("Extract page={page} from filepath={filepath}", page=page, filepath=filepath)) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) + inputfile = pyPdf.PdfFileReader(file(filepath, "r")) pdfpage = inputfile.getPage(page) outbuf = StringIO() outfile = pyPdf.PdfFileWriter() @@ -93,7 +93,7 @@ def do_page_file(docnum): r = werkzeug.Response(outbuf, mimetype="application/pdf") client_filename = os.path.basename(filepath)[:-3] client_filename += str_format(u".Page-{page}.pdf", 
-        r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
+        r.headers.add("Content-Disposition", "attachment", filename=client_filename)
         return r
 
 class MyHtmlFormatter(highlight.HtmlFormatter):
@@ -110,10 +110,10 @@ def do_excerpt(docnum, term):
         q = q.simplify(reader)
         terms = [ text for fieldname, text in q.all_terms()
                     if fieldname == "content" ]
-        excerpt = highlight.highlight(document['content'],
+        excerpt = highlight.highlight(document["content"],
                 terms,
                 analysis.StandardAnalyzer(),
-                highlight.SimpleFragmenter(),
+                highlight.ContextFragmenter(terms, surround=40),
                 MyHtmlFormatter())
         return unicode( excerpt )
@@ -121,7 +121,7 @@ def do_excerpt(docnum, term):
 @app.route("/search/", methods=["GET"])
 def do_search(term=None):
     if not term:
-        return flask.render_template('search.html', objects=[], term="")
+        return flask.render_template("search.html", objects=[], term="")
     term = term.lower()
 
     searcher = index_book.searcher()
@@ -138,23 +138,23 @@ def do_search(term=None):
         with index_metadata.searcher() as searcher:
             docnum = searcher.document_number(path=filepath)
         with index_metadata.reader() as reader2:
-            title = reader2.stored_fields(docnum).get('title')
+            title = reader2.stored_fields(docnum).get("title")
         books[docnum] = {
-            'matches' : [],
-            'title':title,
-            'filename' : os.path.basename(filepath),
+            "matches" : [],
+            "title":title,
+            "filename" : os.path.basename(filepath),
             }
         for match in book[1]:
-            pagenumber = reader.stored_fields(match[0])['pagenumber']
+            pagenumber = reader.stored_fields(match[0])["pagenumber"]
             match = (match[0], match[1], pagenumber)
-            books[docnum]['matches'].append(match)
+            books[docnum]["matches"].append(match)
 
-    return flask.render_template('search.html',
+    return flask.render_template("search.html",
             books=books,
             term=term)
 
 def log_response(sender, response):
-    sender.logger.debug('Request context is about to close down. '
-                        'Response: %s', response)
+    sender.logger.debug("Request context is about to close down. "
+                        "Response: %s", response)
 
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=8000)
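
Usage note (not part of the commit): the standalone sketch below exercises the same 2011-era pdfminer call sequence that the new pdf_extract_text_pdfminer() and pdf_extract_text_pypdf() generators in indexer.py use, so the extraction can be tried on a single file outside the indexer. It is a minimal Python 2 sketch under stated assumptions: "example.pdf" and the file name pdf_extract_demo.py are placeholders, and the up-front "import pdfminer" probe plus the outbuf.truncate() call are adjustments made for this sketch rather than code taken from the commit.

# pdf_extract_demo.py -- standalone sketch, not part of the repository.
import StringIO
import sys


def extract_pages_pdfminer(filepath):
    # Same old-style pdfminer API as indexer.py: parser and document are
    # wired to each other, then each page is rendered into one text buffer.
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    outbuf = StringIO.StringIO()
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())
    parser = PDFParser(open(filepath, "rb"))
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize("")  # empty password

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pagenumber, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        outbuf.seek(0)
        text = unicode(outbuf.read(), "utf-8", errors="replace")
        yield pagenumber + 1, text   # pages are numbered from 1
        outbuf.seek(0)
        outbuf.truncate()            # sketch-only: drop leftovers of longer pages


def extract_pages_pypdf(filepath):
    # pyPdf fallback, mirroring pdf_extract_text_pypdf() in indexer.py.
    import pyPdf
    reader = pyPdf.PdfFileReader(open(filepath, "rb"))
    for pagenumber, page in enumerate(reader.pages):
        yield pagenumber + 1, page.extractText()


if __name__ == "__main__":
    filepath = sys.argv[1] if len(sys.argv) > 1 else "example.pdf"
    try:
        import pdfminer              # probe availability before building the generator
        pages = extract_pages_pdfminer(filepath)
    except ImportError:
        print "Fallback to pypdf"
        pages = extract_pages_pypdf(filepath)
    for pagenumber, text in pages:
        print "page %d: %d characters" % (pagenumber, len(text))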