 indexer.py | 54 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 21 deletions(-)
diff --git a/indexer.py b/indexer.py
index af47ead..57e1c0f 100644
--- a/indexer.py
+++ b/indexer.py
@@ -73,34 +73,46 @@ searcher_book.close()
searcher_metadata.close()
def process_file(filepath):
- print u"{0} processing {1}".format(os.getpid(), filepath)
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
- title = inputfile.getDocumentInfo().title
- writer_metadata = index_metadata.writer()
- writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
- writer_metadata.commit()
- searcher_metadata = index_metadata.searcher()
- metadata_docnum = searcher_metadata.document_number(path=filepath)
- searcher_metadata.close()
+ try:
+ print u"{0} processing {1}".format(os.getpid(), filepath)
+ inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+ title = inputfile.getDocumentInfo().title
+ writer_metadata = index_metadata.writer()
+ writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
+ writer_metadata.commit()
+ searcher_metadata = index_metadata.searcher()
+ metadata_docnum = searcher_metadata.document_number(path=filepath)
+ searcher_metadata.close()
+
+ pagenumber = 1
+ for page in inputfile.pages:
+ print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber)
+ content = page.extractText()
+ writer_book = index_book.writer()
+ writer_book.add_document(pagenumber=pagenumber,
+ metadata_docnum=metadata_docnum,
+ content=content)
+ writer_book.commit()
+ pagenumber += 1
+ except KeyboardInterrupt:
+ return 'KeyboardInterrupt'
+ except Exception,e:
+ print u"{0} failed at {1}: {2}".format(os.getpid(), filepath, e)
- pagenumber = 1
- for page in inputfile.pages:
- print u"processing {0} Page {1}".format(filepath, pagenumber)
- content = page.extractText()
- writer_book = index_book.writer()
- writer_book.add_document(pagenumber=pagenumber,
- metadata_docnum=metadata_docnum,
- content=content)
- writer_book.commit()
- pagenumber += 1
try:
pool = mp.Pool()
- pool.apply(process_file, filepaths)
+ jobs = []
+ for filepath in filepaths:
+ jobs.append( pool.apply_async( process_file, (filepath,) ) )
+ pool.close()
+ pool.join()
+except KeyboardInterrupt:
+ pool.terminate()
except ImportError:
for filepath in filepaths:
process_file(filepath)
print u"optimize indexes"
index_metadata.optimize()
index_metadata.close()
-index_book.optimze()
+index_book.optimize()
index_book.close()
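
The change above follows a common multiprocessing pattern: each worker catches KeyboardInterrupt itself and returns quietly, jobs are queued with pool.apply_async() instead of the blocking pool.apply(), and a Ctrl-C in the parent terminates the pool. A minimal, self-contained sketch of that pattern is shown below; it is not the project's code, and names such as process_item and work_items are placeholders for the real per-file work and file list.

    import multiprocessing as mp
    import os
    import time

    def process_item(item):
        # Worker: do the per-item work; swallow Ctrl-C so the parent decides
        # what happens to the pool instead of every worker dying noisily.
        try:
            print("{0} processing {1}".format(os.getpid(), item))
            time.sleep(0.1)  # stand-in for the real per-item work
            return item
        except KeyboardInterrupt:
            return 'KeyboardInterrupt'

    if __name__ == '__main__':
        work_items = list(range(10))  # placeholder input
        pool = mp.Pool()
        try:
            # apply_async() queues every job at once and returns immediately,
            # unlike apply(), which runs one call at a time and blocks on it.
            jobs = [pool.apply_async(process_item, (item,)) for item in work_items]
            pool.close()   # no more jobs will be submitted
            pool.join()    # wait for the queued jobs to finish
        except KeyboardInterrupt:
            pool.terminate()  # Ctrl-C in the parent: stop all workers immediately
            pool.join()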