blob: 631460dd681573adb2088f2f16b22c34a79a869b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
#!/usr/bin/python2.6
# coding: utf-8
import os
import sys
import pyPdf
from whoosh.index import create_in
import whoosh.fields as fields
import time
# Index layout for one book: the title and full text are searchable and
# stored, the path is an exact-match stored identifier, and createtime
# records when the document was added to the index.
schema = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True),
    content=fields.TEXT(stored=True),
    createtime=fields.NUMERIC(),
)

# create_in() expects the target directory to be present, so create it on
# the first run instead of failing.
index_dir = "index"
if not os.path.isdir(index_dir):
    os.makedirs(index_dir)

# Create a fresh index named "books" inside the index directory and open
# the writer that the extraction step adds documents to.
index = create_in(index_dir, schema, "books")
writer = index.writer()
# extract
directory = u"/media/share/books/isbn"


def _read_pdf(filepath):
    """Return ``(title, text)`` for the PDF file at *filepath*.

    *title* is the title from the PDF's document info, or None when the
    file carries no document info. *text* is the extracted text of every
    page, concatenated in page order. A ``Page i/n`` progress counter is
    rewritten on one stdout line while the pages are processed.
    """
    # PDFs are binary data: open with 'rb', and let the `with` block close
    # the handle even if extraction fails.
    with open(filepath, 'rb') as stream:
        reader = pyPdf.PdfFileReader(stream)
        info = reader.getDocumentInfo()
        # Guard against PDFs without document info.
        title = info.title if info is not None else None
        numpages = reader.getNumPages()
        page_texts = []
        # Pages are extracted inside the `with` block so the stream handed
        # to the reader is still open while they are read.
        for i, page in enumerate(reader.pages, 1):
            sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
            sys.stdout.flush()
            page_texts.append(page.extractText())
    # Terminate the progress line.
    print(u"")
    return title, u"".join(page_texts)


try:
    for path, directories, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".pdf"):
                continue
            filepath = os.path.join(path, filename)
            print(u"Process {0}".format(filepath))
            title, content = _read_pdf(filepath)
            if not title:
                # No usable title metadata: fall back to the file name.
                title = os.path.splitext(filename)[0]
            writer.add_document(title=title, path=filepath, content=content,
                                createtime=time.time())
except KeyboardInterrupt:
    # Ctrl-C only stops the walk early; whatever was indexed so far is
    # still committed below.
    pass
finally:
    # Commit on every exit path. Previously the commit happened only after
    # an interrupt, so a run that finished normally left the index empty.
    writer.commit()
from whoosh.qparser import QueryParser

# Smoke test: look up the word "world" in the indexed book contents and
# print the result object.
searcher = index.searcher()
try:
    # Hand the parser the index schema (by keyword, which both old and new
    # Whoosh signatures accept) and give it a unicode query string.
    query = QueryParser("content", schema=index.schema).parse(u"world")
    results = searcher.search(query)
    print(results)
finally:
    # Release the files held open by the searcher.
    searcher.close()
|