1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
#!/usr/bin/python2.6
# coding: utf-8
import os
import sys
import pyPdf
from whoosh.index import create_in, open_dir
import whoosh.fields as fields
import time
from cStringIO import StringIO
from Queue import Queue, Empty
from threading import Thread, Condition
# Validate the command line before touching the index, so that a missing
# argument yields a usage message instead of an IndexError traceback raised
# after an empty index directory has already been created.
if len(sys.argv) < 2:
    sys.stderr.write("usage: {0} DIRECTORY\n".format(sys.argv[0]))
    sys.exit(2)
# Root directory to scan, decoded from UTF-8 bytes into a unicode string.
directory = unicode(sys.argv[1], "utf8")

# Schema of the index: one document is added per PDF page (see PDFWorker).
schema = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True),
    pagenumber=fields.NUMERIC(stored=True),
    content=fields.TEXT(stored=True),
    createtime=fields.NUMERIC())

# `create` records whether the index was built from scratch in this run;
# later stages use it to skip look-ups against an index known to be empty.
if not os.path.exists("index"):
    create = True
    os.mkdir("index")
    index = create_in(u"index", schema)
else:
    create = False
    index = open_dir("index")

# Paths of PDF files waiting to be parsed.
filepaths = Queue()
# Parsed pages waiting to be indexed; bounded so that the PDF workers are
# at most 5 page documents ahead of the index writer.
documents = Queue(maxsize=5)
notifier = Condition()
searcher = index.searcher()
print u"Walking {0}".format(directory)
filecount = 0
skipped = 0
for path, directories, files in os.walk(directory):
for filename in files:
if filename.endswith(".pdf"):
filepath = os.path.join(path, filename)
docnum = create or searcher.document_number(path=filepath)
if not docnum:
skipped += 1
else:
filepaths.put(filepath)
filecount += 1
print u"\r{0} files found {1} skipped".format(filecount+skipped, skipped),
print ""
if not create:
writer = index.writer()
deleted = 0
processed = 0
for fields in searcher.all_stored_fields():
path = fields['path']
processed += 1
if not os.path.exists(path):
writer.delete_by_term('path', path)
deleted += 1
print u"\r{0} pages processed. {1} deleted".format(processed, deleted),
print ""
writer.commit()
searcher.close()
class PDFWorker(Thread):
def run(self):
while True:
try:
filepath = filepaths.get(False)
except Empty:
break
try:
print u"{0} processing {1}".format(self.name, filepath)
inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
title = inputfile.getDocumentInfo().title
pagenumber = 0
for page in inputfile.pages:
print u"{0} processing {1} Page {2}".format(self.name, filepath, pagenumber)
pagenumber += 1
content = page.extractText()
documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
except Exception, e:
print u"{0} Exception: {1}".format(self.name, str(e))
finally:
print u"{0} finished {1}".format(self.name, filepath)
filepaths.task_done()
class IndexWorker(Thread):
def run(self):
while index != None:
try:
doc = documents.get(True, 0.5)
except Empty,e:
continue
print u"{0} adding {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
writer = index.writer()
writer.add_document(**doc)
writer.commit()
documents.task_done()
print u"{0} added {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
threads = map(lambda i: PDFWorker(), range(1))
for thread in threads:
thread.start()
idx = IndexWorker()
idx.start()
print "all running"
for thread in threads:
thread.join()
oldindex = index
index = None
idx.join()
print "optimize index"
oldindex.optimize()
oldindex.close()
|