#!/usr/bin/env python3
from . import data
import feedparser
from newspaper import Article
from newspaper.outputformatters import OutputFormatter
from newspaper.cleaners import DocumentCleaner
import logging


def _summary_text(paper, summary):
    """Return the plain text extracted from an entry's RSS summary.

    The summary is run through the same newspaper cleaner/formatter
    pipeline that ``paper`` is configured with.

    Args:
        paper: The already parsed ``Article``; only its ``config`` is used.
        summary: The summary markup of the feed entry, or ``None``/empty
            when the entry carries no summary.

    Returns:
        The extracted text. If extraction yields fewer than 2 characters
        and the summary contains no ``<`` (i.e. it looks like plain text),
        the summary itself is returned. An absent summary yields ``''``.
    """
    if not summary:
        return ''

    document_cleaner = DocumentCleaner(paper.config)
    output_formatter = OutputFormatter(paper.config)
    # override this method since it doesn't work on summaries
    output_formatter.links_to_text = lambda: None

    doc = paper.config.get_parser().fromstring(summary)
    text = ''
    # NOTE(review): newspaper's parser appears to return None for markup it
    # cannot parse -- guard so that a bad summary does not discard the entry.
    if doc is not None:
        doc = document_cleaner.clean(doc)
        text, _article_html = output_formatter.get_formatted(doc)

    if len(text) < 2 and '<' not in summary:
        text = summary
    return text


def _process_entry(feed, entry):
    """Download one feed entry and store a new version if it changed.

    Args:
        feed: The feed record the entry belongs to.
        entry: One element of the parsed feed's ``entries``.

    Raises:
        KeyError: If the entry has no ``link``.
        RuntimeError: If neither the article nor the summary produced at
            least 2 characters of text.
    """
    link = entry['link']
    # Prefer the feed supplied id; fall back to the link as identifier.
    uid = entry['id'] if 'id' in entry else link

    item = (data.Item.select()
            .where((data.Item.uid == uid) & (data.Item.feed == feed))
            .first())
    if not item:
        item = data.Item.create(feed=feed, uid=uid)

    paper = Article(link)
    paper.download()
    paper.parse()

    if item.title != paper.title:
        item.title = paper.title
        item.save()

    version = data.Version()
    version.item = item
    version.uid = uid
    version.url = link
    # sorted() rather than list.sort() so the article's own list is not
    # mutated; sorting keeps the joined string stable between fetches.
    version.authors = ', '.join(sorted(paper.authors))
    version.title = paper.title
    version.text = paper.text

    # alternative, try if the rss body got a bigger text
    text = _summary_text(paper, entry.get('summary'))
    if len(text) > len(version.text):
        version.text = text

    if len(version.text) < 2:
        # less than 2 chars is likely failure
        raise RuntimeError('failed to parse {}\n{}'.format(entry, version))

    # search if an identical version of this item was already stored
    ident_version = data.Version.select().where(
        (data.Version.item == version.item) &
        (data.Version.title == version.title) &
        (data.Version.authors == version.authors) &
        (data.Version.text == version.text)).first()
    if ident_version:
        logging.info('No change, skip %s', item.uid)
    else:
        version.save()
        logging.info('Saved new version of %s: %s', item.uid, version.id)


def update(feed):
    """Fetch ``feed`` and record new versions of its entries.

    The feed at ``feed.url`` is parsed, the stored feed title is refreshed
    when the feed reports a different one, and every entry is processed
    independently: a failure in one entry is logged with its traceback and
    does not stop the remaining entries.

    Args:
        feed: Feed record exposing ``url``, ``title`` and ``save()``.
    """
    logging.info('Update %s', feed.url)
    result = feedparser.parse(feed.url)

    # A feed that could not be fetched or parsed may carry no title; keep
    # the stored title in that case instead of failing with a KeyError.
    title = result['feed'].get('title')
    if title is None:
        logging.warning('Feed %s has no title, keeping the stored one',
                        feed.url)
    elif title != feed.title:
        feed.title = title
        feed.save()

    for entry in result['entries']:
        try:
            _process_entry(feed, entry)
        except Exception:
            # .get(): the failure may be the missing link itself, and the
            # handler must not raise while reporting it.
            logging.exception('Failed to process %s', entry.get('link'))