From 7385d9f5ba2cba49db24ee9c874f08fff641cb20 Mon Sep 17 00:00:00 2001
From: yvesf
Date: Sat, 20 Nov 2010 20:39:51 +0100
Subject: multiprocessing fix

---
 indexer.py | 54 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 21 deletions(-)

(limited to 'indexer.py')

diff --git a/indexer.py b/indexer.py
index af47ead..57e1c0f 100644
--- a/indexer.py
+++ b/indexer.py
@@ -73,34 +73,46 @@ searcher_book.close()
 searcher_metadata.close()
 
 def process_file(filepath):
-    print u"{0} processing {1}".format(os.getpid(), filepath)
-    inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-    title = inputfile.getDocumentInfo().title
-    writer_metadata = index_metadata.writer()
-    writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
-    writer_metadata.commit()
-    searcher_metadata = index_metadata.searcher()
-    metadata_docnum = searcher_metadata.document_number(path=filepath)
-    searcher_metadata.close()
+    try:
+        print u"{0} processing {1}".format(os.getpid(), filepath)
+        inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+        title = inputfile.getDocumentInfo().title
+        writer_metadata = index_metadata.writer()
+        writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
+        writer_metadata.commit()
+        searcher_metadata = index_metadata.searcher()
+        metadata_docnum = searcher_metadata.document_number(path=filepath)
+        searcher_metadata.close()
+
+        pagenumber = 1
+        for page in inputfile.pages:
+            print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber)
+            content = page.extractText()
+            writer_book = index_book.writer()
+            writer_book.add_document(pagenumber=pagenumber,
+                                     metadata_docnum=metadata_docnum,
+                                     content=content)
+            writer_book.commit()
+            pagenumber += 1
+    except KeyboardInterrupt:
+        return 'KeyboardInterrupt'
+    except Exception,e:
+        print u"{0} failed at {1}: {2}".format(os.getpid(), filepath, e)
 
-    pagenumber = 1
-    for page in inputfile.pages:
-        print u"processing {0} Page {1}".format(filepath, pagenumber)
-        content = page.extractText()
-        writer_book = index_book.writer()
-        writer_book.add_document(pagenumber=pagenumber,
-                                 metadata_docnum=metadata_docnum,
-                                 content=content)
-        writer_book.commit()
-        pagenumber += 1
 try:
     pool = mp.Pool()
-    pool.apply(process_file, filepaths)
+    jobs = []
+    for filepath in filepaths:
+        jobs.append( pool.apply_async( process_file, (filepath,) ) )
+    pool.close()
+    pool.join()
+except KeyboardInterrupt:
+    pool.terminate()
 except ImportError:
     for filepath in filepaths:
         process_file(filepath)
 print u"optimize indexes"
 index_metadata.optimize()
 index_metadata.close()
-index_book.optimze()
+index_book.optimize()
 index_book.close()
-- 
cgit v1.2.1