diff options
Diffstat (limited to 'quick.py')
-rw-r--r-- | quick.py | 51 |
1 files changed, 0 insertions, 51 deletions
#!/usr/bin/python2.6
# coding: utf-8
"""Index the text of every PDF below a directory with Whoosh, then run a sample query.

The script walks ``directory``, extracts the text of each ``*.pdf`` file with
pyPdf, stores title/path/content in a Whoosh index named "books" inside the
"index" directory, commits the index, and finally prints the results of
searching the ``content`` field for "world".

Pressing Ctrl-C during the crawl stops it early; whatever was added up to that
point is still committed.
"""
import os
import sys
import time

import pyPdf
import whoosh.fields as fields
from whoosh.index import create_in
from whoosh.qparser import QueryParser


def _extract_pdf(filepath, fallback_title):
    """Return ``(title, content)`` for the PDF stored at *filepath*.

    *content* is the concatenated text of all pages.  *fallback_title* is
    returned as the title when the PDF does not provide one.  A
    "Page i/n" progress line is written to stdout while pages are processed.
    """
    # PDFs are binary data, so open in 'rb'; the ``with`` block closes the
    # handle once all pages have been read (the reader pulls from the stream
    # on demand, so extraction must stay inside the block).
    with open(filepath, 'rb') as stream:
        reader = pyPdf.PdfFileReader(stream)
        # NOTE(review): getDocumentInfo() is expected to return None for PDFs
        # without a document-info dictionary -- confirm against the installed
        # pyPdf version.
        info = reader.getDocumentInfo()
        title = info.title if info is not None else None
        numpages = reader.getNumPages()
        chunks = []
        for number, page in enumerate(reader.pages, 1):
            sys.stdout.write("\rPage {0}/{1}".format(number, numpages))
            sys.stdout.flush()
            chunks.append(page.extractText())
    # Terminate the "\r"-rewritten progress line.
    print(u"")
    return title or fallback_title, u"".join(chunks)


schema = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True),
    content=fields.TEXT(stored=True),
    createtime=fields.NUMERIC(),
)

# create_in() needs an existing directory to write the index files into.
if not os.path.exists("index"):
    os.mkdir("index")

index = create_in("index", schema, "books")
writer = index.writer()


# extract
directory = u"/media/share/books/isbn"

try:
    for path, _dirnames, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".pdf"):
                continue
            filepath = os.path.join(path, filename)
            print(u"Process {0}".format(filepath))
            title, content = _extract_pdf(filepath, filename)
            writer.add_document(title=title, path=filepath, content=content,
                                createtime=time.time())
except KeyboardInterrupt:
    # Ctrl-C only stops the crawl; the documents added so far are kept and
    # committed below.
    pass

# Commit on normal completion as well as after Ctrl-C: without a commit the
# added documents never become visible to the searcher.
writer.commit()

searcher = index.searcher()
try:
    # Passing the schema lets the parser analyse the query text the same way
    # the "content" field was analysed at indexing time.
    query = QueryParser("content", schema=index.schema).parse("world")
    results = searcher.search(query)
    print(results)
finally:
    searcher.close()