update INSTALL help; rename index->indexer; prettify web.py

author: yvesf <yvesf-git@xapek.org> 2010-11-20 02:12:50 +0100
committer: yvesf <yvesf-git@xapek.org> 2010-11-20 02:12:50 +0100
commit: bec8150c93a640ac4b6d1cebc86bd721dfdd6b59 (patch)
tree: cc7e02b211afa6ba7ba11432e00124dc617ce760 /index.py
parent: 966a17b12c9deab35ef0a804d9fa1faea9c8042d (diff)
download: booksearch-bec8150c93a640ac4b6d1cebc86bd721dfdd6b59.tar.gz booksearch-bec8150c93a640ac4b6d1cebc86bd721dfdd6b59.zip
1 files changed, 0 insertions, 115 deletions
diff --git a/index.py b/index.py
deleted file mode 100644
index 5a982a9..0000000
--- a/index.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/usr/bin/python2.6
-# coding: utf-8
-import os
-import sys
-import pyPdf
-from whoosh.index import create_in, open_dir
-import whoosh.fields as fields
-import time
-from cStringIO import StringIO
-from Queue import Queue, Empty
-from threading import Thread, Condition
-
-schema = fields.Schema(
-    title=fields.TEXT(stored=True),
-    path=fields.ID(stored=True),
-    pagenumber=fields.NUMERIC(stored=True),
-    content=fields.TEXT(stored=True),
-    createtime=fields.NUMERIC() )
-
-if not os.path.exists("index"):
-    os.mkdir("index")
-    index = create_in(u"index", schema)
-else:
-    index = open_dir("index")
-
-filepaths = Queue()
-documents = Queue()
-notifier = Condition()
-
-directory = unicode(sys.argv[1], "utf8")
-print u"Walking {0}".format(directory)
-filecount = 0
-for path, directories, files in os.walk(directory):
-    for filename in files:
-        if filename.endswith(".pdf"):
-            filepaths.put(os.path.join(path, filename))
-            filecount += 1
-            print u"\r{0} files found".format(filecount),
-print ""
-
-class PDFWorker(Thread):
-    def run(self):
-        while True:
-            try:
-                filepath = filepaths.get(False)
-            except Empty:
-                break
-            try:
-                print u"{0} processing {1}".format(self.name, filepath)
-                inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-                title = inputfile.getDocumentInfo().title
-                pagenumber = 0
-                for page in inputfile.pages:
-                    pagenumber += 1
-                    content = page.extractText()
-                    documents.put( {"title":title, "path":filepath, "pagenumber":pagenumber, "content":content, "createtime":time.time() } )
-            except Exception, e:
-                print u"{0} Exception: {1}".format(self.name, str(e))
-            finally:
-                print u"{0} finished   {1}".format(self.name, filepath)
-                filepaths.task_done()
-
-class IndexWorker(Thread):
-    def run(self):
-        while index != None:
-            try:
-                doc = documents.get(True, 0.5)
-            except Empty:
-                continue
-            print u"{0} adding     {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
-            writer = index.writer()
-            writer.add_document(**doc)
-            writer.commit()
-            documents.task_done()
-            print u"{0} added      {1} page {2}".format(self.name, doc['path'], doc['pagenumber'])
-
-threads = map(lambda i: PDFWorker(), range(1))
-for thread in threads:
-    thread.start()
-
-idx = IndexWorker()
-idx.start()
-print "all running" 
-
-for thread in threads:
-    thread.join()
-
-oldindex = index
-index = None
-print "optimize index"
-oldindex.optimize()
-oldindex.close()
-
-"""
-
-            try:
-                filepath = os.path.join(path, filename)
-                print u"Process {0}".format(filepath)
-                inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
-                title = inputfile.getDocumentInfo().title
-                i=1
-                content = ""
-                numpages = inputfile.getNumPages()
-                for page in inputfile.pages:
-                    sys.stdout.write("\rPage {0}/{1}".format(i, numpages))
-                    sys.stdout.flush()
-                    content += page.extractText()
-                    i+=1
-                print u""
-                writer = index.writer()
-                writer.add_document(title=title, path=filepath, content=content, createtime=time.time())
-                writer.commit()
-            except Exception,e:
-                print e
-"""
author	yvesf <yvesf-git@xapek.org>	2010-11-20 02:12:50 +0100
committer	yvesf <yvesf-git@xapek.org>	2010-11-20 02:12:50 +0100
commit	bec8150c93a640ac4b6d1cebc86bd721dfdd6b59 (patch)
tree	cc7e02b211afa6ba7ba11432e00124dc617ce760 /index.py
parent	966a17b12c9deab35ef0a804d9fa1faea9c8042d (diff)
download	booksearch-bec8150c93a640ac4b6d1cebc86bd721dfdd6b59.tar.gz booksearch-bec8150c93a640ac4b6d1cebc86bd721dfdd6b59.zip