diff options
author | yvesf <yvesf-git@xapek.org> | 2010-11-19 23:31:30 +0100 |
---|---|---|
committer | yvesf <yvesf-git@xapek.org> | 2010-11-19 23:31:30 +0100 |
commit | dac88861a9635504af3813f57c443040b7416163 (patch) | |
tree | 415c268e6d4611b8f49501abcdfab51e62770605 /query.py | |
parent | 0d9a0f4a893e3060f960ec52ccf3effd2ea43674 (diff) | |
download | booksearch-dac88861a9635504af3813f57c443040b7416163.tar.gz booksearch-dac88861a9635504af3813f57c443040b7416163.zip |
threading
Diffstat (limited to 'query.py')
-rw-r--r-- | query.py | 57 |
1 files changed, 12 insertions, 45 deletions
```diff
@@ -1,54 +1,21 @@
 #!/usr/bin/python2.6
 # coding: utf-8
-import os
-import sys
-import pyPdf
-from whoosh.index import create_in
-import whoosh.fields as fields
-import time
-
-schema = fields.Schema(
-    title=fields.TEXT(stored=True),
-    path=fields.ID(stored=True),
-    content=fields.TEXT(stored=True),
-    createtime=fields.NUMERIC() )
-
-if not os.path.exists("index"):
-    os.mkdir("index")
-
-index = create_in(u"index", schema, "books")
-writer = index.writer()
+from whoosh.index import open_dir
+from whoosh.qparser import QueryParser
 
+index = open_dir(u"index", mapped=False)
 
 
-# extract
-directory = "/tank/share/books/isbn"
-try:
-    for path, directories, files in os.walk(directory):
-        for filename in files:
-            if filename.endswith(".pdf"):
-                filepath = os.path.join(path, filename)
-                print u"Process {0}".format(filepath)
-                inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-                title = inputfile.getDocumentInfo().title
-                content = u""
-                i=1
-                numpages = inputfile.getNumPages()
-                for page in inputfile.pages:
-                    sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
-                    sys.stdout.flush()
-                    content += page.extractText()
-                    i+=1
-                print u""
-                writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
-except KeyboardInterrupt:
-    writer.commit()
+searcher = index.searcher()
 
-from whoosh.qparser import QueryParser
+while True:
+    term = raw_input("query> ")
+    query = QueryParser("content").parse(term)
+    print query
 
-searcher = index.searcher()
+    results = searcher.search(query)
+    for result in results:
+        print "Match in {0}".format(result.get("path"))
 
-query = QueryParser("content").parse("world")
+    print "{0} results".format(len(results))
 
-results = searcher.search(query)
-print results
```