summary | refs | log | tree | commit | diff
path: root/indexer.py
diff options
context:
space:
mode:
author yvesf <yvesf-git@xapek.org> 2010-11-20 11:16:59 +0100
committer yvesf <yvesf-git@xapek.org> 2010-11-20 11:16:59 +0100
commit edae6bad0f61ae37593aef7efd5386525562c833 (patch)
tree 75355f2a80497e3f07b1a1359092fcd84d60e54c /indexer.py
parent bec8150c93a640ac4b6d1cebc86bd721dfdd6b59 (diff)
download booksearch-edae6bad0f61ae37593aef7efd5386525562c833.tar.gz
download booksearch-edae6bad0f61ae37593aef7efd5386525562c833.zip
indexer index /delete files
Diffstat (limited to 'indexer.py')
-rw-r--r-- indexer.py 55
1 files changed, 28 insertions, 27 deletions
diff --git a/indexer.py b/indexer.py
index 5a982a9..22f583e 100644
--- a/indexer.py
+++ b/indexer.py
@@ -24,20 +24,42 @@ else:
index = open_dir("index")
filepaths = Queue()
-documents = Queue()
+documents = Queue(maxsize=5) #PDFWorker should be maximal 5 documents ahead
notifier = Condition()
directory = unicode(sys.argv[1], "utf8")
+searcher = index.searcher()
print u"Walking {0}".format(directory)
filecount = 0
+skipped = 0
for path, directories, files in os.walk(directory):
for filename in files:
if filename.endswith(".pdf"):
- filepaths.put(os.path.join(path, filename))
- filecount += 1
- print u"\r{0} files found".format(filecount),
+ filepath = os.path.join(path, filename)
+ docnum = searcher.document_number(path=filepath)
+ if not docnum:
+ skipped += 1
+ else:
+ filepaths.put(filepath)
+ filecount += 1
+ print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
print ""
+writer = index.writer()
+deleted = 0
+processed = 0
+for fields in searcher.all_stored_fields():
+ path = fields['path']
+ processed += 1
+ if not os.path.exists(path):
+ writer.delete_by_term('path', path)
+ deleted += 1
+ print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
+print ""
+
+writer.commit()
+searcher.close()
+
class PDFWorker(Thread):
def run(self):
while True:
@@ -85,31 +107,10 @@ print "all running"
for thread in threads:
thread.join()
+idx.join()
+
oldindex = index
index = None
print "optimize index"
oldindex.optimize()
oldindex.close()
-
-"""
-
- try:
- filepath = os.path.join(path, filename)
- print u"Process {0}".format(filepath)
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
- title = inputfile.getDocumentInfo().title
- i=1
- content = ""
- numpages = inputfile.getNumPages()
- for page in inputfile.pages:
- sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
- sys.stdout.flush()
- content += page.extractText()
- i+=1
- print u""
- writer = index.writer()
- writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
- writer.commit()
- except Exception,e:
- print e
-"""