diff options
Diffstat (limited to 'indexer.py')
-rw-r--r-- | indexer.py | 43 |
1 files changed, 20 insertions, 23 deletions
@@ -2,17 +2,16 @@
 # coding: utf-8
 import os
 import sys
+import time
 import pyPdf
-from whoosh.index import create_in, open_dir
+import whoosh.index as index
+import whoosh.writing as writing
 import whoosh.fields as fields
-import time
-from cStringIO import StringIO
-from Queue import Queue, Empty
 import multiprocessing as mp

 schema_book = fields.Schema(
     pagenumber=fields.NUMERIC(stored=True),
-    metadata_docnum=fields.NUMERIC(stored=True),
+    path=fields.ID(stored=True),
     content=fields.TEXT(stored=True),
 )

@@ -24,13 +23,12 @@ schema_metadata = fields.Schema(
 if not os.path.exists(u"index"):
     create_index = True
     os.mkdir(u"index")
-    index_book = create_in(u"index", schema_book, u"book")
-    index_metadata = create_in(u"index", schema_metadata, u"metadata")
+    index_book = index.create_in(u"index", schema_book, u"book")
+    index_metadata = index.create_in(u"index", schema_metadata, u"metadata")
 else:
     create_index = False
-    index_book = open_dir(u"index", u"book")
-    index_metadata = open_dir(u"index", u"metadata")
-
+    index_book = index.open_dir(u"index", u"book")
+    index_metadata = index.open_dir(u"index", u"metadata")

 filepaths = []
 directory = unicode(sys.argv[1], "utf8")
@@ -43,12 +41,12 @@ for path, directories, files in os.walk(directory):
     for filename in files:
         if filename.endswith(u".pdf"):
             filepath = os.path.join(path, filename)
-            docnum = create_index or searcher_metadata.document_number(path=filepath)
-            if not docnum:
-                skipped += 1
-            else:
+            if create_index or not searcher_metadata.document_number(path=filepath):
                 filepaths.append(filepath)
                 filecount += 1
+            else:
+                #skip files that are already indexed
+                skipped += 1
             print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
 print ""

@@ -61,7 +59,7 @@ if not create_index: #update index for deleted files
         path = fields['path']
         processed += 1
         if not os.path.exists(path):
-            writer_book.delete_by_term(u'metadata_docnum', searcher_metadata.document_number(path=path))
+            writer_book.delete_by_term(u'path', path)
             writer_metadata.delete_by_term('path', path)
             deleted += 1
         print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
@@ -81,20 +79,18 @@ def process_file(filepath):
         writer_metadata = index_metadata.writer()
         writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
         writer_metadata.commit()
-        searcher_metadata = index_metadata.searcher()
-        metadata_docnum = searcher_metadata.document_number(path=filepath)
-        searcher_metadata.close()
+        writer_book = writing.BatchWriter(index_book, limit=1000)

         pagenumber = 1
         for page in inputfile.pages:
             print u"{0} processing {1} Page {2}".format(os.getpid(), filepath, pagenumber)
             content = page.extractText()
-            writer_book = index_book.writer()
             writer_book.add_document(pagenumber=pagenumber,
-                                     metadata_docnum=metadata_docnum,
-                                     content=content)
-            writer_book.commit()
+                                     path=filepath,
+                                     content=content)
             pagenumber += 1
+
+        writer_book.commit()
     except KeyboardInterrupt:
         return 'KeyboardInterrupt'
     except Exception,e:
@@ -111,8 +107,9 @@ except KeyboardInterrupt:
     pool.terminate()
 except ImportError:
     for filepath in filepaths:
-        #if process_file(filepath) == "KeyboardInterrupt":
+        if process_file(filepath) == "KeyboardInterrupt":
             break
+
 print u"optimize indexes"
 index_metadata.optimize()
 index_metadata.close() |