summary | refs | log | tree | commit | diff
path: root/indexer.py
diff options
context:
space:
mode:
author yvesf <yvesf-git@xapek.org> 2010-11-24 21:44:28 +0100
committer yvesf <yvesf-git@xapek.org> 2010-11-24 21:44:28 +0100
commit 0d256d1dacb1d38105859a717db0e05062d89c74 (patch)
tree 7d7f4e766a7c94724267414a87e7dd5f527c231e /indexer.py
parent 35efef41d80413481167f9a3ec779a85c7800e4b (diff)
parent 708137185642f408427372300144412a4102ec38 (diff)
download booksearch-0d256d1dacb1d38105859a717db0e05062d89c74.tar.gz
booksearch-0d256d1dacb1d38105859a717db0e05062d89c74.zip
Merge branch 'master' of ssh://192.168.0.6/home/yvesf/virtualenv/booksearch into HEAD
Conflicts: indexer.py
Diffstat (limited to 'indexer.py')
-rw-r--r-- indexer.py 13
1 files changed, 7 insertions, 6 deletions
diff --git a/indexer.py b/indexer.py
index 3f1ecd6..913591a 100644
--- a/indexer.py
+++ b/indexer.py
@@ -7,6 +7,7 @@ import pyPdf
import whoosh.index as index
import whoosh.writing as writing
import whoosh.fields as fields
+from compat import str_format
schema_book = fields.Schema(
pagenumber=fields.NUMERIC(stored=True),
@@ -33,7 +34,7 @@ filepaths = []
directory = unicode(sys.argv[1], "utf8")
searcher_book = index_book.searcher()
searcher_metadata = index_metadata.searcher()
-print u"Walking {0}".format(directory)
+print str_format(u"Walking {dir}",dir=directory)
filecount = 0
skipped = 0
for path, directories, files in os.walk(directory):
@@ -46,7 +47,7 @@ for path, directories, files in os.walk(directory):
else:
#skip files that are already indexed
skipped += 1
- print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
+ print str_format(u"\r{count} files found {skip} skipped", count=filecount+skipped, skip=skipped),
print ""
if not create_index: #update index for deleted files
@@ -61,7 +62,7 @@ if not create_index: #update index for deleted files
writer_book.delete_by_term(u'path', path)
writer_metadata.delete_by_term('path', path)
deleted += 1
- print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
+ print str_format(u"\r{proc} pages processed. {deleted} deleted", proc=processed, deleted=deleted),
print ""
writer_book.commit()
writer_metadata.commit()
@@ -71,7 +72,7 @@ searcher_metadata.close()
def process_file(filepath):
try:
- print u"{0} processing {1}".format(os.getpid(), filepath)
+ print str_format(u"{pid} processing {filepath}", pid=os.getpid(), filepath=filepath)
inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
title = inputfile.getDocumentInfo().title
@@ -82,7 +83,7 @@ def process_file(filepath):
pagenumber = 1
for page in inputfile.pages:
- print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber)
+ print str_format(u"{pid} processing {filepath} Page {page}", pid=os.getpid(), filepath=filepath, page=pagenumber)
content = page.extractText()
writer_book.add_document(pagenumber=pagenumber,
path=filepath,
@@ -93,7 +94,7 @@ def process_file(filepath):
except KeyboardInterrupt:
return 'KeyboardInterrupt'
except Exception,e:
- print u"{0} failed at {1}: {2}".format(os.getpid(), filepath, e)
+ print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=e)
try:
import multiprocessing as mp