diff options
-rw-r--r-- | indexer.py | 54 |
1 files changed, 33 insertions, 21 deletions
@@ -73,34 +73,46 @@
 searcher_book.close()
 searcher_metadata.close()
 
 def process_file(filepath):
-    print u"{0} processing {1}".format(os.getpid(), filepath)
-    inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-    title = inputfile.getDocumentInfo().title
-    writer_metadata = index_metadata.writer()
-    writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
-    writer_metadata.commit()
-    searcher_metadata = index_metadata.searcher()
-    metadata_docnum = searcher_metadata.document_number(path=filepath)
-    searcher_metadata.close()
+    try:
+        print u"{0} processing {1}".format(os.getpid(), filepath)
+        inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+        title = inputfile.getDocumentInfo().title
+        writer_metadata = index_metadata.writer()
+        writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
+        writer_metadata.commit()
+        searcher_metadata = index_metadata.searcher()
+        metadata_docnum = searcher_metadata.document_number(path=filepath)
+        searcher_metadata.close()
+
+        pagenumber = 1
+        for page in inputfile.pages:
+            print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber)
+            content = page.extractText()
+            writer_book = index_book.writer()
+            writer_book.add_document(pagenumber=pagenumber,
+                                     metadata_docnum=metadata_docnum,
+                                     content=content)
+            writer_book.commit()
+            pagenumber += 1
+    except KeyboardInterrupt:
+        return 'KeyboardInterrupt'
+    except Exception,e:
+        print u"{0} failed at {1}: {2}".format(os.getpid(), filepath, e)
 
-    pagenumber = 1
-    for page in inputfile.pages:
-        print u"processing {0} Page {1}".format(filepath, pagenumber)
-        content = page.extractText()
-        writer_book = index_book.writer()
-        writer_book.add_document(pagenumber=pagenumber,
-                                 metadata_docnum=metadata_docnum,
-                                 content=content)
-        writer_book.commit()
-        pagenumber += 1
 try:
     pool = mp.Pool()
-    pool.apply(process_file, filepaths)
+    jobs = []
+    for filepath in filepaths:
+        jobs.append( pool.apply_async( process_file, (filepath,) ) )
+
+    pool.close()
+    pool.join()
+except KeyboardInterrupt:
+    pool.terminate()
 except ImportError:
     for filepath in filepaths:
         process_file(filepath)
 
 print u"optimize indexes"
 index_metadata.optimize()
 index_metadata.close()
-index_book.optimze()
+index_book.optimize()
 index_book.close()