summary | refs | log | tree | commit | diff
path: root/indexer.py
diff options
context:
space:
mode:
author yvesf <yvesf-git@xapek.org> 2010-11-20 18:20:32 +0100
committer yvesf <yvesf-git@xapek.org> 2010-11-20 18:20:32 +0100
commit f78911646cf53e37c47921f9dcd9702d6e946f54 (patch)
tree 013c97eeae02c9abfdfa205ff4e6258a120464a0 /indexer.py
parent d66c0d42b678408921f5c3cb43c18df5b1f87338 (diff)
download booksearch-f78911646cf53e37c47921f9dcd9702d6e946f54.tar.gz
booksearch-f78911646cf53e37c47921f9dcd9702d6e946f54.zip
ergebnisliste unbefriedigend
Diffstat (limited to 'indexer.py')
-rw-r--r-- indexer.py 35
1 files changed, 19 insertions, 16 deletions
diff --git a/indexer.py b/indexer.py
index 22f583e..ac14a9e 100644
--- a/indexer.py
+++ b/indexer.py
@@ -18,9 +18,11 @@ schema = fields.Schema(
createtime=fields.NUMERIC() )
if not os.path.exists("index"):
+ create = True
os.mkdir("index")
index = create_in(u"index", schema)
else:
+ create = False
index = open_dir("index")
filepaths = Queue()
@@ -36,7 +38,7 @@ for path, directories, files in os.walk(directory):
for filename in files:
if filename.endswith(".pdf"):
filepath = os.path.join(path, filename)
- docnum = searcher.document_number(path=filepath)
+ docnum = create or searcher.document_number(path=filepath)
if not docnum:
skipped += 1
else:
@@ -45,19 +47,20 @@ for path, directories, files in os.walk(directory):
print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
print ""
-writer = index.writer()
-deleted = 0
-processed = 0
-for fields in searcher.all_stored_fields():
- path = fields['path']
- processed += 1
- if not os.path.exists(path):
- writer.delete_by_term('path', path)
- deleted += 1
- print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
-print ""
+if not create:
+ writer = index.writer()
+ deleted = 0
+ processed = 0
+ for fields in searcher.all_stored_fields():
+ path = fields['path']
+ processed += 1
+ if not os.path.exists(path):
+ writer.delete_by_term('path', path)
+ deleted += 1
+ print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
+ print ""
+ writer.commit()
-writer.commit()
searcher.close()
class PDFWorker(Thread):
@@ -73,6 +76,7 @@ class PDFWorker(Thread):
title = inputfile.getDocumentInfo().title
pagenumber = 0
for page in inputfile.pages:
+ print u"{0} processing {1} Page {2}".format(self.name, filepath, pagenumber)
pagenumber += 1
content = page.extractText()
documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
@@ -87,7 +91,7 @@ class IndexWorker(Thread):
while index != None:
try:
doc = documents.get(True, 0.5)
- except Empty:
+ except Empty,e:
continue
print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
writer = index.writer()
@@ -107,10 +111,9 @@ print "all running"
for thread in threads:
thread.join()
-idx.join()
-
oldindex = index
index = None
+idx.join()
print "optimize index"
oldindex.optimize()
oldindex.close()