summary refs log tree commit diff
path: root/query.py
diff options
context:
space:
mode:
Diffstat (limited to 'query.py')
-rw-r--r-- query.py 57
1 files changed, 12 insertions, 45 deletions
diff --git a/query.py b/query.py
index f7f70f3..b9033f3 100644
--- a/query.py
+++ b/query.py
@@ -1,54 +1,21 @@
#!/usr/bin/python2.6
# coding: utf-8
-import os
-import sys
-import pyPdf
-from whoosh.index import create_in
-import whoosh.fields as fields
-import time
-
-schema = fields.Schema(
- title=fields.TEXT(stored=True),
- path=fields.ID(stored=True),
- content=fields.TEXT(stored=True),
- createtime=fields.NUMERIC() )
-
-if not os.path.exists("index"):
- os.mkdir("index")
-
-index = create_in(u"index", schema, "books")
-writer = index.writer()
+from whoosh.index import open_dir
+from whoosh.qparser import QueryParser
+index = open_dir(u"index", mapped=False)
-# extract
-directory = "/tank/share/books/isbn"
-try:
- for path, directories, files in os.walk(directory):
- for filename in files:
- if filename.endswith(".pdf"):
- filepath = os.path.join(path, filename)
- print u"Process {0}".format(filepath)
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
- title = inputfile.getDocumentInfo().title
- content = u""
- i=1
- numpages = inputfile.getNumPages()
- for page in inputfile.pages:
- sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
- sys.stdout.flush()
- content += page.extractText()
- i+=1
- print u""
- writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
-except KeyboardInterrupt:
- writer.commit()
+searcher = index.searcher()
-from whoosh.qparser import QueryParser
+while True:
+ term = raw_input("query> ")
+ query = QueryParser("content").parse(term)
+ print query
-searcher = index.searcher()
+ results = searcher.search(query)
+ for result in results:
+ print "Match in {0}".format(result.get("path"))
-query = QueryParser("content").parse("world")
+ print "{0} results".format(len(results))
-results = searcher.search(query)
-print results