diff options
author | yvesf <yvesf-git@xapek.org> | 2010-11-20 11:16:59 +0100 |
---|---|---|
committer | yvesf <yvesf-git@xapek.org> | 2010-11-20 11:16:59 +0100 |
commit | edae6bad0f61ae37593aef7efd5386525562c833 (patch) | |
tree | 75355f2a80497e3f07b1a1359092fcd84d60e54c | |
parent | bec8150c93a640ac4b6d1cebc86bd721dfdd6b59 (diff) | |
download | booksearch-edae6bad0f61ae37593aef7efd5386525562c833.tar.gz booksearch-edae6bad0f61ae37593aef7efd5386525562c833.zip |
indexer index /delete files
-rw-r--r-- | indexer.py | 55 |
1 files changed, 28 insertions, 27 deletions
```diff
@@ -24,20 +24,42 @@ else:
     index = open_dir("index")
 
 filepaths = Queue()
-documents = Queue()
+documents = Queue(maxsize=5) #PDFWorker should be maximal 5 documents ahead
 notifier = Condition()
 
 directory = unicode(sys.argv[1], "utf8")
+searcher = index.searcher()
 print u"Walking {0}".format(directory)
 filecount = 0
+skipped = 0
 for path, directories, files in os.walk(directory):
     for filename in files:
         if filename.endswith(".pdf"):
-            filepaths.put(os.path.join(path, filename))
-            filecount += 1
-            print u"\r{0} files found".format(filecount),
+            filepath = os.path.join(path, filename)
+            docnum = searcher.document_number(path=filepath)
+            if not docnum:
+                skipped += 1
+            else:
+                filepaths.put(filepath)
+                filecount += 1
+            print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
 print ""
 
+writer = index.writer()
+deleted = 0
+processed = 0
+for fields in searcher.all_stored_fields():
+    path = fields['path']
+    processed += 1
+    if not os.path.exists(path):
+        writer.delete_by_term('path', path)
+        deleted += 1
+    print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
+print ""
+
+writer.commit()
+searcher.close()
+
 class PDFWorker(Thread):
     def run(self):
         while True:
@@ -85,31 +107,10 @@ print "all running"
 for thread in threads:
     thread.join()
 
+idx.join()
+
 oldindex = index
 index = None
 print "optimize index"
 oldindex.optimize()
 oldindex.close()
-
-"""
-
-    try:
-        filepath = os.path.join(path, filename)
-        print u"Process {0}".format(filepath)
-        inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-        title = inputfile.getDocumentInfo().title
-        i=1
-        content = ""
-        numpages = inputfile.getNumPages()
-        for page in inputfile.pages:
-            sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
-            sys.stdout.flush()
-            content += page.extractText()
-            i+=1
-        print u""
-        writer = index.writer()
-        writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
-        writer.commit()
-    except Exception,e:
-        print e
-"""
```