summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- LOG 7
-rw-r--r-- quick.py 51
2 files changed, 58 insertions, 0 deletions
diff --git a/LOG b/LOG
new file mode 100644
index 0000000..e850077
--- /dev/null
+++ b/LOG
@@ -0,0 +1,7 @@
+virtualenv booksearch_env
+cd booksearch_env
+. ./bin/activate
+easy_install pip
+pip install whoosh
+pip install pypdf
+
diff --git a/quick.py b/quick.py
new file mode 100644
index 0000000..631460d
--- /dev/null
+++ b/quick.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python2.6
+# coding: utf-8
+"""Index the text of every PDF under `directory` with Whoosh, then run a sample query."""
+import os
+import sys
+import pyPdf
+from whoosh.index import create_in
+from whoosh.qparser import QueryParser
+import whoosh.fields as fields
+import time
+
+schema = fields.Schema(
+    title=fields.TEXT(stored=True),
+    path=fields.ID(stored=True),
+    content=fields.TEXT(stored=True),
+    createtime=fields.NUMERIC())
+
+if not os.path.exists("index"):  # create_in() needs the index directory to exist
+    os.mkdir("index")
+index = create_in("index", schema, "books")
+writer = index.writer()
+
+directory = u"/media/share/books/isbn"  # root of the book collection to crawl
+
+try:
+    for path, directories, files in os.walk(directory):
+        for filename in files:
+            if not filename.endswith(".pdf"):
+                continue
+            filepath = os.path.join(path, filename)
+            print u"Process {0}".format(filepath)
+            # Binary mode; pyPdf reads pages lazily, so keep the file open meanwhile.
+            with open(filepath, 'rb') as pdf:
+                inputfile = pyPdf.PdfFileReader(pdf)
+                info = inputfile.getDocumentInfo()  # None if the PDF has no /Info
+                title = info.title if info is not None else None
+                numpages = inputfile.getNumPages()
+                pages = []
+                for i, page in enumerate(inputfile.pages, 1):
+                    sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
+                    sys.stdout.flush()
+                    pages.append(page.extractText())
+            print u""
+            writer.add_document(title=title, path=filepath, content=u"".join(pages), createtime=time.time())
+except KeyboardInterrupt:
+    pass  # Ctrl-C only cuts the crawl short; what was indexed so far is kept
+writer.commit()  # also on normal completion, else the search below finds nothing
+
+searcher = index.searcher()
+query = QueryParser("content", schema=schema).parse("world")
+print searcher.search(query)