From edae6bad0f61ae37593aef7efd5386525562c833 Mon Sep 17 00:00:00 2001 From: yvesf Date: Sat, 20 Nov 2010 11:16:59 +0100 Subject: indexer index /delete files --- indexer.py | 55 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/indexer.py b/indexer.py index 5a982a9..22f583e 100644 --- a/indexer.py +++ b/indexer.py @@ -24,20 +24,42 @@ else: index = open_dir("index") filepaths = Queue() -documents = Queue() +documents = Queue(maxsize=5) #PDFWorker should be maximal 5 documents ahead notifier = Condition() directory = unicode(sys.argv[1], "utf8") +searcher = index.searcher() print u"Walking {0}".format(directory) filecount = 0 +skipped = 0 for path, directories, files in os.walk(directory): for filename in files: if filename.endswith(".pdf"): - filepaths.put(os.path.join(path, filename)) - filecount += 1 - print u"\r{0} files found".format(filecount), + filepath = os.path.join(path, filename) + docnum = searcher.document_number(path=filepath) + if not docnum: + skipped += 1 + else: + filepaths.put(filepath) + filecount += 1 + print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped), print "" +writer = index.writer() +deleted = 0 +processed = 0 +for fields in searcher.all_stored_fields(): + path = fields['path'] + processed += 1 + if not os.path.exists(path): + writer.delete_by_term('path', path) + deleted += 1 + print u"\r{0} pages processed. {1} deleted".format(processed, deleted), +print "" + +writer.commit() +searcher.close() + class PDFWorker(Thread): def run(self): while True: @@ -85,31 +107,10 @@ print "all running" for thread in threads: thread.join() +idx.join() + oldindex = index index = None print "optimize index" oldindex.optimize() oldindex.close() - -""" - - try: - filepath = os.path.join(path, filename) - print u"Process {0}".format(filepath) - inputfile = pyPdf.PdfFileReader(file(filepath, 'r')) - title = inputfile.getDocumentInfo().title - i=1 - content = "" - numpages = inputfile.getNumPages() - for page in inputfile.pages: - sys.stdout.write("\rPage {0}/{1}".format(i, numpages)) - sys.stdout.flush() - content += page.extractText() - i+=1 - print u"" - writer = index.writer() - writer.add_document(title=title, path=filepath, content=content, createtime=time.time()) - writer.commit() - except Exception,e: - print e -""" -- cgit v1.2.1