author     yvesf <yvesf-git@xapek.org>    2010-11-20 20:39:51 +0100
committer  yvesf <yvesf-git@xapek.org>    2010-11-20 20:39:51 +0100
commit     7385d9f5ba2cba49db24ee9c874f08fff641cb20 (patch)
tree       0e0313a45000348561f244741903d8e7aa5cc368 /indexer.py
parent     f87201ef681eb3e94cb570c5c83a22d2e8b269d2 (diff)
download   booksearch-7385d9f5ba2cba49db24ee9c874f08fff641cb20.tar.gz
           booksearch-7385d9f5ba2cba49db24ee9c874f08fff641cb20.zip
multiprocessing fix
Diffstat (limited to 'indexer.py')
-rw-r--r--    indexer.py    54
1 file changed, 33 insertions, 21 deletions
diff --git a/indexer.py b/indexer.py
index af47ead..57e1c0f 100644
--- a/indexer.py
+++ b/indexer.py
@@ -73,34 +73,46 @@ searcher_book.close()
 searcher_metadata.close()
 def process_file(filepath):
-    print u"{0} processing {1}".format(os.getpid(), filepath)
-    inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-    title = inputfile.getDocumentInfo().title
-    writer_metadata = index_metadata.writer()
-    writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
-    writer_metadata.commit()
-    searcher_metadata = index_metadata.searcher()
-    metadata_docnum = searcher_metadata.document_number(path=filepath)
-    searcher_metadata.close()
+    try:
+        print u"{0} processing {1}".format(os.getpid(), filepath)
+        inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+        title = inputfile.getDocumentInfo().title
+        writer_metadata = index_metadata.writer()
+        writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
+        writer_metadata.commit()
+        searcher_metadata = index_metadata.searcher()
+        metadata_docnum = searcher_metadata.document_number(path=filepath)
+        searcher_metadata.close()
+
+        pagenumber = 1
+        for page in inputfile.pages:
+            print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber)
+            content = page.extractText()
+            writer_book = index_book.writer()
+            writer_book.add_document(pagenumber=pagenumber,
+                                     metadata_docnum=metadata_docnum,
+                                     content=content)
+            writer_book.commit()
+            pagenumber += 1
+    except KeyboardInterrupt:
+        return 'KeyboardInterrupt'
+    except Exception,e:
+        print u"{0} failed at {1}: {2}".format(os.getpid(), filepath, e)
-    pagenumber = 1
-    for page in inputfile.pages:
-        print u"processing {0} Page {1}".format(filepath, pagenumber)
-        content = page.extractText()
-        writer_book = index_book.writer()
-        writer_book.add_document(pagenumber=pagenumber,
-                                 metadata_docnum=metadata_docnum,
-                                 content=content)
-        writer_book.commit()
-        pagenumber += 1
 try:
     pool = mp.Pool()
-    pool.apply(process_file, filepaths)
+    jobs = []
+    for filepath in filepaths:
+        jobs.append( pool.apply_async( process_file, (filepath,) ) )
+    pool.close()
+    pool.join()
+except KeyboardInterrupt:
+    pool.terminate()
 except ImportError:
     for filepath in filepaths:
         process_file(filepath)
 print u"optimize indexes"
 index_metadata.optimize()
 index_metadata.close()
-index_book.optimze()
+index_book.optimize()
 index_book.close()
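
For reference, below is a minimal, self-contained sketch of the Pool pattern this patch switches to: one apply_async() job per file instead of a single blocking pool.apply(process_file, filepaths) call, followed by close()/join(), with terminate() on Ctrl-C, and a worker that catches its own exceptions so one bad file does not abort the whole run. The handle() worker and the example paths are placeholders, not code from indexer.py; the patch's ImportError fallback to sequential processing is omitted here.

import multiprocessing as mp

def handle(path):
    # Placeholder worker standing in for process_file(); it catches its own
    # exceptions so a failing file only logs an error, and returns early on
    # KeyboardInterrupt so Ctrl-C does not leave the pool hanging.
    try:
        print("{0} processing {1}".format(mp.current_process().pid, path))
        return len(path)  # stand-in for the real indexing work
    except KeyboardInterrupt:
        return 'KeyboardInterrupt'
    except Exception as e:
        print("failed at {0}: {1}".format(path, e))

if __name__ == '__main__':
    paths = ['a.pdf', 'b.pdf', 'c.pdf']   # illustrative inputs
    pool = mp.Pool()
    try:
        # One asynchronous job per file; pool.apply() would instead make a
        # single synchronous call with the whole list as its argument tuple.
        jobs = [pool.apply_async(handle, (p,)) for p in paths]
        pool.close()   # no further jobs will be submitted
        pool.join()    # wait until every worker has finished
        print([j.get() for j in jobs])
    except KeyboardInterrupt:
        pool.terminate()   # kill the workers immediately on Ctrl-C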