#!/usr/bin/env python3
import argparse
import logging
import os

import feedparser
from newspaper import Article
from newspaper.cleaners import DocumentCleaner
from newspaper.outputformatters import OutputFormatter

import data
import web

logging.basicConfig(level=logging.INFO)


def update(feed):
    """Fetch a feed and store a new Version for every changed entry."""
    logging.info("Update %s", feed.url)
    result = feedparser.parse(feed.url)

    # Keep the stored feed title in sync with the feed itself.
    if result['feed']['title'] != feed.title:
        feed.title = result['feed']['title']
        feed.save()

    for entry in result['entries']:
        try:
            url = entry['link']
            # Prefer the entry's GUID; fall back to the link for feeds
            # that don't provide one.
            uid = entry['id'] if 'id' in entry else url

            item = data.Item.select() \
                .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
                .first()
            if not item:
                item = data.Item.create(feed=feed, uid=uid)

            # Fetch and extract the full article with newspaper.
            paper = Article(url)
            paper.download()
            paper.parse()
            if item.title != paper.title:
                item.title = paper.title
                item.save()

            version = data.Version()
            version.item = item
            version.uid = uid
            version.url = url
            version.authors = ", ".join(sorted(paper.authors))
            version.title = paper.title
            version.text = paper.text

            # Alternatively, check whether the RSS body carries a longer
            # text than the scraped page, and prefer it if so. Entries
            # without a summary skip this fallback entirely.
            summary = entry.get('summary')
            if summary:
                document_cleaner = DocumentCleaner(paper.config)
                output_formatter = OutputFormatter(paper.config)
                # Override this method since it doesn't work on summaries.
                output_formatter.links_to_text = lambda: None
                doc = paper.config.get_parser().fromstring(summary)
                doc = document_cleaner.clean(doc)
                text, article_html = output_formatter.get_formatted(doc)
                if len(text) < 2 and '<' not in summary:
                    # The summary was plain text; use it verbatim.
                    text = summary
                if len(text) > len(version.text):
                    version.text = text

            if len(version.text) < 2:
                # Less than 2 chars is almost certainly an extraction failure.
                raise Exception("failed to parse {}\n{}".format(entry, version))

            # Skip saving if an identical version is already stored.
            ident_version = data.Version.select().where(
                (data.Version.item == version.item) &
                (data.Version.title == version.title) &
                (data.Version.authors == version.authors) &
                (data.Version.text == version.text)).first()
            if ident_version:
                logging.info("No change, skip %s", item.uid)
            else:
                version.save()
                logging.info("Saved new version of %s: %s",
                             item.uid, version.id)
        except Exception:
            logging.exception("Failed to process %s", entry['link'])
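
# The companion `data` module is not part of this listing. Judging from how
# it is used here (data.db.initialize, data.BaseModel.__subclasses__, the
# Feed/Item/Version models), it is a thin peewee layer. A minimal sketch of
# what it might look like follows; every field name and default below is an
# assumption inferred from this script, not the actual implementation:
#
#     # data.py (hypothetical sketch)
#     import datetime
#     import peewee
#     from peewee import SqliteDatabase  # re-exported as data.SqliteDatabase
#
#     db = peewee.Proxy()  # bound to the real database via db.initialize(...)
#
#     class BaseModel(peewee.Model):
#         class Meta:
#             database = db
#
#     class Feed(BaseModel):
#         url = peewee.TextField()
#         title = peewee.TextField(null=True)
#         created_date = peewee.DateTimeField(default=datetime.datetime.now)
#
#     class Item(BaseModel):
#         feed = peewee.ForeignKeyField(Feed, backref='items')
#         uid = peewee.TextField()
#         title = peewee.TextField(null=True)
#
#     class Version(BaseModel):
#         item = peewee.ForeignKeyField(Item, backref='versions')
#         uid = peewee.TextField()
#         url = peewee.TextField()
#         title = peewee.TextField(null=True)
#         authors = peewee.TextField(default='')
#         text = peewee.TextField(default='')
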
if __name__ == "__main__":
    # Open (or create) the SQLite database next to this script and create
    # any model tables that don't exist yet.
    db = data.SqliteDatabase(os.path.join(
        os.path.dirname(__file__), "db.sqlite"))
    data.db.initialize(db)
    data.db.connect()
    data.db.create_tables([t for t in data.BaseModel.__subclasses__()
                           if not t.table_exists()])

    parser = argparse.ArgumentParser(
        description='Track RSS/Atom feeds and archive a new version of '
                    'every article whenever it changes.')
    parser.add_argument('--add', metavar='URL', help='add feed with URL')
    parser.add_argument('--update', action='store_true',
                        help='update all known feeds')
    parser.add_argument('--update-feed', type=int, metavar='ID',
                        help='update feed with ID')
    parser.add_argument('--remove', type=int, metavar='ID',
                        help='remove feed with ID')
    parser.add_argument('--list', action='store_true',
                        help='list all registered feeds')
    parser.add_argument('--web', action='store_true',
                        help='run web interface')
    args = parser.parse_args()

    if args.update:
        for feed in data.Feed.select():
            try:
                update(feed)
            except Exception:
                logging.exception("Failed to update %s", feed.url)
    if args.update_feed:
        # Feed.get() raises DoesNotExist if the ID is unknown.
        feed = data.Feed.get(data.Feed.id == args.update_feed)
        update(feed)
    if args.add:
        feed = data.Feed.create(url=args.add)
        print("Added {}".format(feed))
    if args.remove:
        feed = data.Feed.get(data.Feed.id == args.remove)
        feed.delete_instance()
        print("Removed {}".format(feed))
    if args.list:
        for feed in data.Feed.select().order_by(data.Feed.created_date):
            print("{0.id} - {1} - {0.title} - {0.url}".format(
                feed, feed.created_date.strftime('%x %X')))
    if args.web:
        web.run()
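
# Example usage, assuming this file is saved as e.g. tracker.py (the actual
# filename is not part of this listing):
#
#     ./tracker.py --add https://example.org/feed.xml   # register a feed
#     ./tracker.py --list                               # show registered feeds
#     ./tracker.py --update                             # fetch all feeds once
#     ./tracker.py --update-feed 1                      # fetch a single feed
#     ./tracker.py --web                                # start the web UI
#
# Since the script has no internal loop, a periodic --update run (e.g. via
# cron) is what actually accumulates article versions over time.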