From 0d9a0f4a893e3060f960ec52ccf3effd2ea43674 Mon Sep 17 00:00:00 2001
From: User
Date: Fri, 19 Nov 2010 21:47:48 +0100
Subject: blah

---
 index.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 query.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 index.py
 create mode 100644 query.py

diff --git a/index.py b/index.py
new file mode 100644
index 0000000..f7f70f3
--- /dev/null
+++ b/index.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python2.6
+# coding: utf-8
+"""Build a Whoosh full-text index of every PDF under a directory tree.
+
+Each ``*.pdf`` below ``directory`` is read with pyPdf; its title, path,
+extracted text and the time of indexing are added as one document to the
+"books" index stored in ``./index``.  When the scan is finished (or cut
+short with Ctrl-C) the index is committed and a sample query is run.
+"""
+import os
+import sys
+import time
+
+import pyPdf
+import whoosh.fields as fields
+from whoosh.index import create_in
+from whoosh.qparser import QueryParser
+
+schema = fields.Schema(
+    title=fields.TEXT(stored=True),
+    path=fields.ID(stored=True),
+    content=fields.TEXT(stored=True),
+    createtime=fields.NUMERIC())
+
+if not os.path.exists("index"):
+    os.mkdir("index")
+
+# create_in() starts a new, empty index named "books" in ./index.
+index = create_in(u"index", schema, "books")
+writer = index.writer()
+
+# Root of the tree to scan.  A unicode root makes os.walk() return unicode
+# paths on Python 2; Whoosh expects unicode values for indexed fields.
+directory = u"/tank/share/books/isbn"
+
+try:
+    for path, directories, files in os.walk(directory):
+        for filename in files:
+            if not filename.endswith(".pdf"):
+                continue
+            filepath = os.path.join(path, filename)
+            print u"Process {0}".format(filepath)
+            # PDFs are binary data: open with 'rb', and let the with-block
+            # close the handle promptly instead of leaving it open.
+            with open(filepath, 'rb') as stream:
+                inputfile = pyPdf.PdfFileReader(stream)
+                # getDocumentInfo() may return None when the PDF carries no
+                # document-information dictionary.
+                info = inputfile.getDocumentInfo()
+                title = info.title if info is not None else None
+                numpages = inputfile.getNumPages()
+                # Gather page texts in a list and join once instead of
+                # growing one string with += for every page.
+                pages = []
+                for i, page in enumerate(inputfile.pages, 1):
+                    sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
+                    sys.stdout.flush()
+                    pages.append(page.extractText())
+            content = u"".join(pages)
+            print u""
+            writer.add_document(title=title, path=filepath, content=content,
+                                createtime=time.time())
+except KeyboardInterrupt:
+    # Ctrl-C only stops the scan; what was added so far is still committed.
+    pass
+
+# Commit on normal completion too -- without it nothing reaches the index.
+writer.commit()
+
+searcher = index.searcher()
+try:
+    # Give the parser the schema so query terms are analyzed like the field.
+    query = QueryParser("content", schema=schema).parse("world")
+    results = searcher.search(query)
+    print results
+finally:
+    searcher.close()
diff --git a/query.py b/query.py
new file mode 100644
index 0000000..f7f70f3
--- /dev/null
+++ b/query.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python2.6
+# coding: utf-8
+"""Build a Whoosh full-text index of every PDF under a directory tree.
+
+Each ``*.pdf`` below ``directory`` is read with pyPdf; its title, path,
+extracted text and the time of indexing are added as one document to the
+"books" index stored in ``./index``.  When the scan is finished (or cut
+short with Ctrl-C) the index is committed and a sample query is run.
+"""
+import os
+import sys
+import time
+
+import pyPdf
+import whoosh.fields as fields
+from whoosh.index import create_in
+from whoosh.qparser import QueryParser
+
+schema = fields.Schema(
+    title=fields.TEXT(stored=True),
+    path=fields.ID(stored=True),
+    content=fields.TEXT(stored=True),
+    createtime=fields.NUMERIC())
+
+if not os.path.exists("index"):
+    os.mkdir("index")
+
+# create_in() starts a new, empty index named "books" in ./index.
+index = create_in(u"index", schema, "books")
+writer = index.writer()
+
+# Root of the tree to scan.  A unicode root makes os.walk() return unicode
+# paths on Python 2; Whoosh expects unicode values for indexed fields.
+directory = u"/tank/share/books/isbn"
+
+try:
+    for path, directories, files in os.walk(directory):
+        for filename in files:
+            if not filename.endswith(".pdf"):
+                continue
+            filepath = os.path.join(path, filename)
+            print u"Process {0}".format(filepath)
+            # PDFs are binary data: open with 'rb', and let the with-block
+            # close the handle promptly instead of leaving it open.
+            with open(filepath, 'rb') as stream:
+                inputfile = pyPdf.PdfFileReader(stream)
+                # getDocumentInfo() may return None when the PDF carries no
+                # document-information dictionary.
+                info = inputfile.getDocumentInfo()
+                title = info.title if info is not None else None
+                numpages = inputfile.getNumPages()
+                # Gather page texts in a list and join once instead of
+                # growing one string with += for every page.
+                pages = []
+                for i, page in enumerate(inputfile.pages, 1):
+                    sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
+                    sys.stdout.flush()
+                    pages.append(page.extractText())
+            content = u"".join(pages)
+            print u""
+            writer.add_document(title=title, path=filepath, content=content,
+                                createtime=time.time())
+except KeyboardInterrupt:
+    # Ctrl-C only stops the scan; what was added so far is still committed.
+    pass
+
+# Commit on normal completion too -- without it nothing reaches the index.
+writer.commit()
+
+searcher = index.searcher()
+try:
+    # Give the parser the schema so query terms are analyzed like the field.
+    query = QueryParser("content", schema=schema).parse("world")
+    results = searcher.search(query)
+    print results
+finally:
+    searcher.close()
-- 
cgit v1.2.1