#!/usr/bin/python2.6
# coding: utf-8
#
# Index the text and metadata of every PDF below a directory into two Whoosh
# indexes ("book" holds page content, "metadata" holds per-file information).
# Usage: python2.6 <script> /path/to/pdf/directory

import os
import sys
import time

import pyPdf
import whoosh.index as index
import whoosh.writing as writing
import whoosh.fields as fields

from compat import str_format

# one document per PDF page
schema_book = fields.Schema(
    pagenumber=fields.NUMERIC(stored=True),
    path=fields.ID(stored=True),
    content=fields.TEXT(stored=True),
)

# one document per PDF file
schema_metadata = fields.Schema(
    title=fields.TEXT(stored=True),
    path=fields.ID(stored=True, unique=True),
    createtime=fields.NUMERIC(stored=True),
)

if not os.path.exists(u"index"):
    create_index = True
    os.mkdir(u"index")
    index_book = index.create_in(u"index", schema_book, u"book")
    index_metadata = index.create_in(u"index", schema_metadata, u"metadata")
else:
    create_index = False
    index_book = index.open_dir(u"index", u"book")
    index_metadata = index.open_dir(u"index", u"metadata")

filepaths = []
directory = unicode(sys.argv[1], "utf8")

searcher_book = index_book.searcher()
searcher_metadata = index_metadata.searcher()

print str_format(u"Walking {dir}", dir=directory)
filecount = 0
skipped = 0
for path, directories, files in os.walk(directory):
    for filename in files:
        if filename.endswith(u".pdf"):
            filepath = os.path.join(path, filename)
            if create_index or not searcher_metadata.document_number(path=filepath):
                filepaths.append(filepath)
                filecount += 1
            else:
                # skip files that are already indexed
                skipped += 1
            print str_format(u"\r{count} files found {skip} skipped",
                             count=filecount + skipped, skip=skipped),
print ""

if not create_index:
    # remove index entries for files that no longer exist on disk
    writer_book = index_book.writer()
    writer_metadata = index_metadata.writer()
    deleted = 0
    processed = 0
    for stored in searcher_metadata.all_stored_fields():
        path = stored['path']
        processed += 1
        if not os.path.exists(path):
            writer_book.delete_by_term(u'path', path)
            writer_metadata.delete_by_term(u'path', path)
            deleted += 1
        print str_format(u"\r{proc} files processed. {deleted} deleted",
                         proc=processed, deleted=deleted),
    print ""
    writer_book.commit()
    writer_metadata.commit()

searcher_book.close()
searcher_metadata.close()


def process_file(filepath):
    try:
        print str_format(u"{pid} processing {filepath}",
                         pid=os.getpid(), filepath=filepath)
        # open in binary mode; text mode can corrupt PDF data on some platforms
        inputfile = pyPdf.PdfFileReader(file(filepath, 'rb'))
        title = inputfile.getDocumentInfo().title

        writer_metadata = index_metadata.writer()
        writer_metadata.add_document(title=title, path=filepath,
                                     createtime=time.time())
        writer_metadata.commit()

        writer_book = writing.BatchWriter(index_book, limit=1000)
        pagenumber = 1
        for page in inputfile.pages:
            print str_format(u"{pid} processing {filepath} Page {page}",
                             pid=os.getpid(), filepath=filepath, page=pagenumber)
            content = page.extractText()
            writer_book.add_document(pagenumber=pagenumber, path=filepath,
                                     content=content)
            pagenumber += 1
        writer_book.commit()
    except KeyboardInterrupt:
        return 'KeyboardInterrupt'
    except Exception, e:
        print str_format(u"{pid} failed at {filepath}: {err}",
                         pid=os.getpid(), filepath=filepath, err=e)


try:
    import multiprocessing as mp
    pool = mp.Pool()
    jobs = []
    for filepath in filepaths:
        jobs.append(pool.apply_async(process_file, (filepath,)))
    pool.close()
    pool.join()
except KeyboardInterrupt:
    pool.terminate()
except ImportError:
    # multiprocessing is unavailable: fall back to sequential processing
    for filepath in filepaths:
        if process_file(filepath) == 'KeyboardInterrupt':
            break

print u"optimize indexes"
index_metadata.optimize()
index_metadata.close()
index_book.optimize()
index_book.close()