blob: f7f70f33e6e5d2f8d17c03769c5ddc311888874a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
#!/usr/bin/python2.6
# coding: utf-8
import os
import sys
import pyPdf
from whoosh.index import create_in
import whoosh.fields as fields
import time
# Index layout for one book: title, path and extracted text are stored with
# the document; createtime is a NUMERIC field that is not marked as stored.
schema = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True),
    content=fields.TEXT(stored=True),
    createtime=fields.NUMERIC())

# The index lives in the "index" directory relative to the current working
# directory; create that directory on first use.
index_dir_present = os.path.exists("index")
if not index_dir_present:
    os.mkdir("index")

# Create the index named "books" in that directory and open a writer on it.
index = create_in(u"index", schema, "books")
writer = index.writer()
# extract
directory = "/tank/share/books/isbn"
try:
for path, directories, files in os.walk(directory):
for filename in files:
if filename.endswith(".pdf"):
filepath = os.path.join(path, filename)
print u"Process {0}".format(filepath)
inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
title = inputfile.getDocumentInfo().title
content = u""
i=1
numpages = inputfile.getNumPages()
for page in inputfile.pages:
sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
sys.stdout.flush()
content += page.extractText()
i+=1
print u""
writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
except KeyboardInterrupt:
writer.commit()
from whoosh.qparser import QueryParser
searcher = index.searcher()
query = QueryParser("content").parse("world")
results = searcher.search(query)
print results
|