indexer: index / delete files

author: yvesf <yvesf-git@xapek.org> 2010-11-20 11:16:59 +0100
committer: yvesf <yvesf-git@xapek.org> 2010-11-20 11:16:59 +0100
commit: edae6bad0f61ae37593aef7efd5386525562c833 (patch)
tree: 75355f2a80497e3f07b1a1359092fcd84d60e54c /indexer.py
parent: bec8150c93a640ac4b6d1cebc86bd721dfdd6b59 (diff)
download: booksearch-edae6bad0f61ae37593aef7efd5386525562c833.tar.gz
booksearch-edae6bad0f61ae37593aef7efd5386525562c833.zip
1 files changed, 28 insertions, 27 deletions
diff --git a/indexer.py b/indexer.py
index 5a982a9..22f583e 100644
--- a/indexer.py
+++ b/indexer.py
@@ -24,20 +24,42 @@ else:
     index = open_dir("index")
 
 filepaths = Queue()
-documents = Queue()
+documents = Queue(maxsize=5)  # PDFWorker should be at most 5 documents ahead
 notifier = Condition()
 
 directory = unicode(sys.argv[1], "utf8")
+searcher = index.searcher()
 print u"Walking {0}".format(directory)
 filecount = 0
+skipped = 0
 for path, directories, files in os.walk(directory):
     for filename in files:
         if filename.endswith(".pdf"):
-            filepaths.put(os.path.join(path, filename))
-            filecount += 1
-            print u"\r{0} files found".format(filecount),
+            filepath = os.path.join(path, filename)
+            docnum = searcher.document_number(path=filepath)
+            if docnum is not None:  # already indexed -> skip (docnum 0 is a valid hit, so no truthiness test)
+                skipped += 1
+            else:
+                filepaths.put(filepath)
+                filecount += 1
+            print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
 print ""
 
+writer = index.writer()
+deleted = 0
+processed = 0
+for fields in searcher.all_stored_fields():
+    path = fields['path']
+    processed += 1
+    if not os.path.exists(path):
+        writer.delete_by_term('path', path)
+        deleted += 1
+    print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
+print ""
+
+writer.commit()
+searcher.close()
+
 class PDFWorker(Thread):
     def run(self):
         while True:
@@ -85,31 +107,10 @@ print "all running"
 for thread in threads:
     thread.join()
 
+idx.join()
+
 oldindex = index
 index = None
 print "optimize index"
 oldindex.optimize()
 oldindex.close()
-
-"""
-
-            try:
-                filepath = os.path.join(path, filename)
-                print u"Process {0}".format(filepath)
-                inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-                title = inputfile.getDocumentInfo().title
-                i=1
-                content = ""
-                numpages = inputfile.getNumPages()
-                for page in inputfile.pages:
-                    sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
-                    sys.stdout.flush()
-                    content += page.extractText()
-                    i+=1
-                print u""
-                writer = index.writer()
-                writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
-                writer.commit()
-            except Exception,e:
-                print e
-"""
author	yvesf <yvesf-git@xapek.org>	2010-11-20 11:16:59 +0100
committer	yvesf <yvesf-git@xapek.org>	2010-11-20 11:16:59 +0100
commit	edae6bad0f61ae37593aef7efd5386525562c833 (patch)
tree	75355f2a80497e3f07b1a1359092fcd84d60e54c /indexer.py
parent	bec8150c93a640ac4b6d1cebc86bd721dfdd6b59 (diff)
download	booksearch-edae6bad0f61ae37593aef7efd5386525562c833.tar.gz booksearch-edae6bad0f61ae37593aef7efd5386525562c833.zip