summaryrefslogtreecommitdiff
path: root/indexer.py
diff options
context:
space:
mode:
authoryvesf <yvesf-git@xapek.org>2010-11-26 23:59:59 +0100
committeryvesf <yvesf-git@xapek.org>2010-11-26 23:59:59 +0100
commit15f17b4fce5a1e4b70709aac569c18d8cf57deba (patch)
tree89eebcc47aebed71cc86eedfaee8cf894e47c07e /indexer.py
parent9122cfed447a643a8c383be5558fd82fc3be7cc7 (diff)
downloadbooksearch-15f17b4fce5a1e4b70709aac569c18d8cf57deba.tar.gz
booksearch-15f17b4fce5a1e4b70709aac569c18d8cf57deba.zip
add pdfminer for text extraction;
replaced all ' with "; metadata hacking
Diffstat (limited to 'indexer.py')
-rw-r--r--indexer.py97
1 file changed, 87 insertions, 10 deletions
diff --git a/indexer.py b/indexer.py
index 913591a..3ec1e8b 100644
--- a/indexer.py
+++ b/indexer.py
@@ -3,11 +3,90 @@
import os
import sys
import time
-import pyPdf
import whoosh.index as index
import whoosh.writing as writing
import whoosh.fields as fields
from compat import str_format
+import StringIO
+
+
+def pdf_extract_metadata(filepath):
+ from pdfminer.pdfparser import PDFParser, PDFDocument
+ from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+ from pdfminer.converter import TextConverter
+ from pdfminer.layout import LAParams
+ from lxml import etree
+
+ outbuf = StringIO.StringIO()
+ rsrcmgr = PDFResourceManager()
+ device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())
+ parser = PDFParser(file(filepath, "rb"))
+ doc = PDFDocument()
+
+ parser.set_document(doc)
+ doc.set_parser(parser)
+ doc.initialize("")
+
+ namespaces={
+ "dc":"http://purl.org/dc/elements/1.1/",
+ "rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+ "pdf":"http://ns.adobe.com/pdf/1.3/", }
+
+ if doc.catalog.has_key("Metadata"):
+ obj_ref = doc.catalog["Metadata"]
+ obj_stream = obj_ref.resolve()
+ if obj_stream.attrs['Subtype'].name == "XML":
+ obj_data = obj_stream.get_data()
+ if obj_data.endswith("\nf"):
+ obj_data = obj_data[:-len("\nf")]
+ print obj_data
+ tree = etree.parse(StringIO.StringIO(obj_data))
+ print tree.xpath("//dc:title/rdf:Alt/rdf:li", namespaces=namespaces)[0].text
+ return obj_data
+ else:
+ return None
+
def pdf_extract_text_pdfminer(filepath):
    """Extract text with pdfminer, yielding (pagenumber, text) per page.

    Page numbers start at 1; text is a unicode string with undecodable
    bytes replaced.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    outbuf = StringIO.StringIO()
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())
    fp = open(filepath, "rb")
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()

        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")  # empty password

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for pagenumber, page in enumerate(doc.get_pages()):
            interpreter.process_page(page)
            outbuf.seek(0)
            content = unicode(outbuf.read(), "utf-8", errors="replace")
            yield (pagenumber + 1, content)  # start pages at 1
            # Rewind AND truncate: a bare seek(0) lets the tail of a longer
            # previous page leak into the next page's text.
            outbuf.seek(0)
            outbuf.truncate()
    finally:
        fp.close()
+
def pdf_extract_text_pypdf(filepath):
    """Extract text with pyPdf, yielding (pagenumber, text) per page.

    Page numbers start at 1.  Used as a fallback when pdfminer is not
    installed.
    """
    import pyPdf
    # open in binary mode: PDF is binary data, and text mode corrupts it
    # on platforms that translate line endings
    inputfile = pyPdf.PdfFileReader(file(filepath, "rb"))

    for pagenumber, page in enumerate(inputfile.pages):
        yield (pagenumber + 1, page.extractText())
+
+""" Yields (pagenumber, text) """
+def pdf_extract_text(filepath):
+ try:
+ return pdf_extract_text_pdfminer(filepath)
+ except ImportError:
+ print "Fallback to pypdf"
+ return pdf_extract_text_pypdf(filepath)
schema_book = fields.Schema(
pagenumber=fields.NUMERIC(stored=True),
@@ -56,11 +135,11 @@ if not create_index: #update index for deleted files
deleted = 0
processed = 0
for fields in searcher_metadata.all_stored_fields():
- path = fields['path']
+ path = fields["path"]
processed += 1
if not os.path.exists(path):
- writer_book.delete_by_term(u'path', path)
- writer_metadata.delete_by_term('path', path)
+ writer_book.delete_by_term(u"path", path)
+ writer_metadata.delete_by_term("path", path)
deleted += 1
print str_format(u"\r{proc} pages processed. {deleted} deleted", proc=processed, deleted=deleted),
print ""
@@ -73,8 +152,7 @@ searcher_metadata.close()
def process_file(filepath):
try:
print str_format(u"{pid} processing {filepath}", pid=os.getpid(), filepath=filepath)
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
- title = inputfile.getDocumentInfo().title
+ title = u"notimplemented"
writer_metadata = index_metadata.writer()
writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
@@ -82,9 +160,8 @@ def process_file(filepath):
writer_book = writing.BatchWriter(index_book, limit=1000)
pagenumber = 1
- for page in inputfile.pages:
+ for pagenumber, content in pdf_extract_text(filepath):
print str_format(u"{pid} processing {filepath} Page {page}", pid=os.getpid(), filepath=filepath, page=pagenumber)
- content = page.extractText()
writer_book.add_document(pagenumber=pagenumber,
path=filepath,
content=content)
@@ -92,9 +169,9 @@ def process_file(filepath):
writer_book.commit()
except KeyboardInterrupt:
- return 'KeyboardInterrupt'
+ return "KeyboardInterrupt"
except Exception,e:
- print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=e)
+ print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=str(e))
try:
import multiprocessing as mp