diff options
-rw-r--r-- | indexer.py | 13 | ||||
-rw-r--r-- | web.py | 15 |
2 files changed, 17 insertions, 11 deletions
@@ -7,6 +7,7 @@ import pyPdf import whoosh.index as index import whoosh.writing as writing import whoosh.fields as fields +from compat import str_format schema_book = fields.Schema( pagenumber=fields.NUMERIC(stored=True), @@ -33,7 +34,7 @@ filepaths = [] directory = unicode(sys.argv[1], "utf8") searcher_book = index_book.searcher() searcher_metadata = index_metadata.searcher() -print u"Walking {0}".format(directory) +print str_format(u"Walking {dir}",dir=directory) filecount = 0 skipped = 0 for path, directories, files in os.walk(directory): @@ -46,7 +47,7 @@ for path, directories, files in os.walk(directory): else: #skip files that are already indexed skipped += 1 - print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped), + print str_format(u"\r{count} files found {skip} skipped", count=filecount+skipped, skip=skipped), print "" if not create_index: #update index for deleted files @@ -61,7 +62,7 @@ if not create_index: #update index for deleted files writer_book.delete_by_term(u'path', path) writer_metadata.delete_by_term('path', path) deleted += 1 - print u"\r{0} pages processed. {1} deleted".format(processed, deleted), + print str_format(u"\r{proc} pages processed. {deleted} deleted", proc=processed, deleted=deleted), print "" writer_book.commit() writer_metadata.commit() @@ -71,7 +72,7 @@ searcher_metadata.close() def process_file(filepath): try: - print u"{0} processing {1}".format(os.getpid(), filepath) + print str_format(u"{pid} processing {filepath}", pid=os.getpid(), filepath=filepath) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) title = inputfile.getDocumentInfo().title @@ -82,7 +83,7 @@ def process_file(filepath): pagenumber = 1 for page in inputfile.pages: - print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber) + print str_format(u"{pid} processing {filepath} Page {page}", pid=os.getpid(), filepath=filepath, page=pagenumber) content = page.extractText() writer_book.add_document(pagenumber=pagenumber, path=filepath, @@ -93,7 +94,7 @@ def process_file(filepath): except KeyboardInterrupt: return 'KeyboardInterrupt' except Exception,e: - print u"{0} failed at {1}: {2}".format(os.getpid(), filepath, e) + print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=e) try: import multiprocessing as mp @@ -1,7 +1,11 @@ #!/usr/bin/python2.6 # coding: utf-8 +from __future__ import with_statement import os -import json +try: + import json +except ImportError: + import simplejson as json from StringIO import StringIO from whoosh.index import open_dir from whoosh.qparser import QueryParser @@ -13,6 +17,7 @@ import flask import pyPdf import werkzeug import subprocess +from compat import str_format app = flask.Flask("booksearch") @@ -38,7 +43,7 @@ def pdf_to_image(filepath, page, size): size = 260 density = 60 + 0.15 * size - app.logger.debug("Convert PDF to image page={0} size={1} density={2} filepath={3}".format(page, size, density, filepath)) + app.logger.debug( str_format("Convert PDF to image page={page} size={size} density={density} filepath={filepath}", page=page, size=size, density=density, filepath=filepath )) def generator(process, input): input.seek(0) @@ -51,7 +56,7 @@ def pdf_to_image(filepath, page, size): out = pyPdf.PdfFileWriter() out.addPage(page) out.write(pdffile) - process = subprocess.Popen(["/usr/bin/convert", "-density", "{0}".format(density), "-resize", "{0}x".format(size), "pdf:-", "jpeg:-"], + process = subprocess.Popen(["/usr/bin/convert", "-density", str(density), "-resize", str(size), "pdf:-", "jpeg:-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) return werkzeug.Response(generator(process,pdffile), mimetype="image/jpeg") @@ -76,7 +81,7 @@ def do_page_file(docnum): document = reader.stored_fields(docnum) filepath = document['path'] page = document['pagenumber'] - 1 - app.logger.debug("Extract page={0} from filepath={1}".format(page, filepath) ) + app.logger.debug(str_format("Extract page={page} from filepath={filepath}", page=page, filepath=filepath)) inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) pdfpage = inputfile.getPage(page) outbuf = StringIO() @@ -86,7 +91,7 @@ def do_page_file(docnum): outbuf.seek(0) r = werkzeug.Response(outbuf, mimetype="application/pdf") client_filename = os.path.basename(filepath)[:-3] - client_filename += u".Page-{0}.pdf".format(page) + client_filename += str_format(u".Page-{page}.pdf", page=page) r.headers.add('Content-Disposition', 'attachment', filename=client_filename) return r |