diff options
Diffstat (limited to 'indexer.py')
-rw-r--r-- | indexer.py | 35 |
1 files changed, 19 insertions, 16 deletions
@@ -18,9 +18,11 @@ schema = fields.Schema( createtime=fields.NUMERIC() ) if not os.path.exists("index"): + create = True os.mkdir("index") index = create_in(u"index", schema) else: + create = False index = open_dir("index") filepaths = Queue() @@ -36,7 +38,7 @@ for path, directories, files in os.walk(directory): for filename in files: if filename.endswith(".pdf"): filepath = os.path.join(path, filename) - docnum = searcher.document_number(path=filepath) + docnum = create or searcher.document_number(path=filepath) if not docnum: skipped += 1 else: @@ -45,19 +47,20 @@ for path, directories, files in os.walk(directory): print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped), print "" -writer = index.writer() -deleted = 0 -processed = 0 -for fields in searcher.all_stored_fields(): - path = fields['path'] - processed += 1 - if not os.path.exists(path): - writer.delete_by_term('path', path) - deleted += 1 - print u"\r{0} pages processed. {1} deleted".format(processed, deleted), -print "" +if not create: + writer = index.writer() + deleted = 0 + processed = 0 + for fields in searcher.all_stored_fields(): + path = fields['path'] + processed += 1 + if not os.path.exists(path): + writer.delete_by_term('path', path) + deleted += 1 + print u"\r{0} pages processed. {1} deleted".format(processed, deleted), + print "" + writer.commit() -writer.commit() searcher.close() class PDFWorker(Thread): @@ -73,6 +76,7 @@ class PDFWorker(Thread): title = inputfile.getDocumentInfo().title pagenumber = 0 for page in inputfile.pages: + print u"{0} processing {1} Page {2}".format(self.name, filepath, pagenumber) pagenumber += 1 content = page.extractText() documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } ) @@ -87,7 +91,7 @@ class IndexWorker(Thread): while index != None: try: doc = documents.get(True, 0.5) - except Empty: + except Empty,e: continue print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber']) writer = index.writer() @@ -107,10 +111,9 @@ print "all running" for thread in threads: thread.join() -idx.join() - oldindex = index index = None +idx.join() print "optimize index" oldindex.optimize() oldindex.close() |