#!/usr/bin/python2.6
# coding: utf-8
"""Index the text of every PDF below a directory with Whoosh, then run a query.

The script (re)creates a Whoosh index named "books" in ./index, walks
``directory`` for ``*.pdf`` files, extracts each file's title and page text
with pyPdf, and stores one document per file.  Pressing Ctrl-C stops the walk
early; whatever was indexed so far is still committed.  Finally it searches
the ``content`` field for "world" and prints the result object.
"""
import os
import sys
import time

import pyPdf
import whoosh.fields as fields
from whoosh.index import create_in
from whoosh.qparser import QueryParser

schema = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True),
    content=fields.TEXT(stored=True),
    createtime=fields.NUMERIC(),
)

if not os.path.exists("index"):
    os.mkdir("index")

index = create_in(u"index", schema, "books")
writer = index.writer()

# extract
# A unicode root makes os.walk() yield unicode paths on Python 2, which is
# what Whoosh expects for indexed field values.
directory = u"/tank/share/books/isbn"


def _extract_pdf(filepath):
    """Return ``(title, content)`` for the PDF at *filepath*.

    *title* is the title from the document info dictionary, or ``None`` when
    the PDF has no info dictionary or no title entry.  *content* is the
    extracted text of all pages concatenated in page order.  A "Page i/n"
    progress line is rewritten on stdout while pages are processed.
    """
    # PDFs are binary; the handle must stay open while pyPdf reads pages.
    with open(filepath, 'rb') as handle:
        reader = pyPdf.PdfFileReader(handle)
        info = reader.getDocumentInfo()
        title = info.title if info is not None else None
        numpages = reader.getNumPages()
        pages = []
        for number, page in enumerate(reader.pages, 1):
            sys.stdout.write("\rPage {0}/{1}".format(number, numpages))
            sys.stdout.flush()
            pages.append(page.extractText())
    # Terminate the "\r"-rewritten progress line.
    print(u"")
    return title, u"".join(pages)


try:
    for path, directories, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".pdf"):
                continue
            filepath = os.path.join(path, filename)
            print(u"Process {0}".format(filepath))
            title, content = _extract_pdf(filepath)
            writer.add_document(
                # Fall back to the file name when the PDF carries no title.
                title=title or filename,
                path=filepath,
                content=content,
                createtime=time.time(),
            )
except KeyboardInterrupt:
    # Ctrl-C only stops the walk early; the script continues to the search.
    pass
finally:
    # Commit on every exit path: without it a normally finished run would
    # leave the index empty.
    writer.commit()

searcher = index.searcher()
try:
    query = QueryParser("content", schema=index.schema).parse(u"world")
    results = searcher.search(query)
    # Print while the searcher is still open; results read from it.
    print(results)
finally:
    searcher.close()