summary | refs | log | tree | commit | diff
path: root/quick.py
diff options
context:
space:
mode:
Diffstat (limited to 'quick.py')
-rw-r--r-- quick.py 51
1 files changed, 0 insertions, 51 deletions
diff --git a/quick.py b/quick.py
deleted file mode 100644
index 631460d..0000000
--- a/quick.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/python2.6
-# coding: utf-8
-import os
-import sys
-import pyPdf
-from whoosh.index import create_in
-import whoosh.fields as fields
-import time
-
-schema = fields.Schema(
- title=fields.TEXT(stored=True),
- path=fields.ID(stored=True),
- content=fields.TEXT(stored=True),
- createtime=fields.NUMERIC() )
-
-index = create_in("index", schema, "books")
-writer = index.writer()
-
-
-# extract
-directory = u"/media/share/books/isbn"
-
-try:
- for path, directories, files in os.walk(directory):
- for filename in files:
- if filename.endswith(".pdf"):
- filepath = os.path.join(path, filename)
- print u"Process {0}".format(filepath)
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
- title = inputfile.getDocumentInfo().title
- content = u""
- i=1
- numpages = inputfile.getNumPages()
- for page in inputfile.pages:
- sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
- sys.stdout.flush()
- content += page.extractText()
- i+=1
- print u""
- writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
-except KeyboardInterrupt:
- writer.commit()
-
-from whoosh.qparser import QueryParser
-
-searcher = index.searcher()
-
-query = QueryParser("content").parse("world")
-
-results = searcher.search(query)
-print results