#!/usr/bin/python2.6
# coding: utf-8
# Index the text of every PDF under a directory into two Whoosh indexes:
# one document per page ("book") and one document per file ("metadata").
# Usage: python indexer.py <directory>
import os
import sys
import time

import pyPdf
import whoosh.index as index
import whoosh.writing as writing
import whoosh.fields as fields
import multiprocessing as mp

# One document per PDF page.
schema_book = fields.Schema(
    pagenumber=fields.NUMERIC(stored=True),
    path=fields.ID(stored=True),
    content=fields.TEXT(stored=True),
)

# One document per PDF file, used to detect already-indexed and deleted files.
schema_metadata = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True, unique=True),
    createtime=fields.NUMERIC(stored=True),
)

# Create the index directory on the first run, otherwise open the existing indexes.
if not os.path.exists(u"index"):
    create_index = True
    os.mkdir(u"index")
    index_book = index.create_in(u"index", schema_book, u"book")
    index_metadata = index.create_in(u"index", schema_metadata, u"metadata")
else:
    create_index = False
    index_book = index.open_dir(u"index", u"book")
    index_metadata = index.open_dir(u"index", u"metadata")

filepaths = []
directory = unicode(sys.argv[1], "utf8")
searcher_book = index_book.searcher()
searcher_metadata = index_metadata.searcher()

# Collect the PDFs that still need indexing.
print u"Walking {0}".format(directory)
filecount = 0
skipped = 0
for path, directories, files in os.walk(directory):
    for filename in files:
        if filename.endswith(u".pdf"):
            filepath = os.path.join(path, filename)
            # document_number() returns None when the path is not in the index
            # (a plain truth test would also reject document number 0).
            if create_index or searcher_metadata.document_number(path=filepath) is None:
                filepaths.append(filepath)
                filecount += 1
            else:
                # Skip files that are already indexed.
                skipped += 1
            print u"\r{0} files found {1} skipped".format(filecount + skipped, skipped),
print ""

if not create_index:
    # Remove index entries for files that no longer exist on disk.
    writer_book = index_book.writer()
    writer_metadata = index_metadata.writer()
    deleted = 0
    processed = 0
    for stored in searcher_metadata.all_stored_fields():
        path = stored['path']
        processed += 1
        if not os.path.exists(path):
            writer_book.delete_by_term(u'path', path)
            writer_metadata.delete_by_term(u'path', path)
            deleted += 1
        print u"\r{0} files processed. {1} deleted".format(processed, deleted),
    print ""
    writer_book.commit()
    writer_metadata.commit()

searcher_book.close()
searcher_metadata.close()


def process_file(filepath):
    """Extract the text of one PDF and add it to both indexes."""
    try:
        print u"{0} processing {1}".format(os.getpid(), filepath)
        # PDFs are binary files, so open them in binary mode.
        inputfile = pyPdf.PdfFileReader(open(filepath, 'rb'))
        title = inputfile.getDocumentInfo().title
        writer_metadata = index_metadata.writer()
        writer_metadata.add_document(title=title, path=filepath,
                                     createtime=time.time())
        writer_metadata.commit()
        # BatchWriter buffers page documents and commits them in batches of 1000.
        writer_book = writing.BatchWriter(index_book, limit=1000)
        pagenumber = 1
        for page in inputfile.pages:
            print u"{0} processing {1} Page {2}".format(os.getpid(), filepath,
                                                        pagenumber)
            content = page.extractText()
            writer_book.add_document(pagenumber=pagenumber, path=filepath,
                                     content=content)
            pagenumber += 1
        writer_book.commit()
    except KeyboardInterrupt:
        return 'KeyboardInterrupt'
    except Exception as e:
        print u"{0} failed at {1}: {2}".format(os.getpid(), filepath, e)


try:
    # Process the PDFs in a worker pool. Note that all workers write to the
    # same on-disk indexes, so they contend for Whoosh's index lock.
    pool = mp.Pool()
    jobs = []
    for filepath in filepaths:
        jobs.append(pool.apply_async(process_file, (filepath,)))
    pool.close()
    pool.join()
except KeyboardInterrupt:
    pool.terminate()
except ImportError:
    # mp.Pool() raises ImportError on platforms without working semaphores;
    # fall back to processing the files sequentially.
    for filepath in filepaths:
        if process_file(filepath) == "KeyboardInterrupt":
            break

print u"optimize indexes"
index_metadata.optimize()
index_metadata.close()
index_book.optimize()
index_book.close()