#!/usr/bin/env python3
# Derived from watchnews.py as added by commit
# 76b682ac3073638fda5d6caa23594dd56bf6f06d ("a functional draft",
# Yves Fischer, Sat, 21 Nov 2015 13:27:51 +0100).
"""Watch news feeds and record new versions of their articles.

For every entry of a registered feed the linked article is downloaded and
parsed with ``newspaper``.  A ``data.Version`` row is stored whenever no
version with the same title, authors and text exists yet for that item.
"""
import argparse
import logging
import os

import feedparser
from newspaper import Article
from newspaper.cleaners import DocumentCleaner
from newspaper.outputformatters import OutputFormatter

import data
import web


logging.basicConfig(level=logging.INFO)


def _summary_text(paper, summary):
    """Return the text extracted from a feed entry's summary ('' if none).

    The summary is run through the same cleaner/formatter that ``paper``
    is configured with, so it can be compared to the article text.
    """
    if not summary:
        return ""

    document_cleaner = DocumentCleaner(paper.config)
    output_formatter = OutputFormatter(paper.config)
    # Override this method since it doesn't work on summaries.
    output_formatter.links_to_text = lambda: None
    doc = paper.config.get_parser().fromstring(summary)
    doc = document_cleaner.clean(doc)
    text, _html = output_formatter.get_formatted(doc)

    # Nothing was extracted and the summary contains no markup: use it as is.
    if len(text) < 2 and '<' not in summary:
        text = summary
    return text


def _process_entry(feed, entry):
    """Fetch one feed entry and store a new version if its content changed.

    Raises:
        KeyError: if the entry has no ``link``.
        ValueError: if neither the article nor the summary yields any text.
    """
    link = entry['link']
    # Entries without an explicit id are identified by their link.
    uid = entry['id'] if 'id' in entry else link

    item = data.Item.select() \
        .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
        .first()
    if not item:
        item = data.Item.create(feed=feed, uid=uid)

    paper = Article(link)
    paper.download()
    paper.parse()

    if item.title != paper.title:
        item.title = paper.title
        item.save()

    version = data.Version()
    version.item = item
    version.uid = uid
    version.url = link
    # sorted() keeps paper.authors untouched and makes the joined string
    # independent of the order the authors were extracted in.
    version.authors = ", ".join(sorted(paper.authors))
    version.title = paper.title
    version.text = paper.text

    # Alternative: use the feed's own summary if it holds more text than
    # what was extracted from the article page.
    summary_text = _summary_text(paper, entry.get('summary'))
    if len(summary_text) > len(version.text):
        version.text = summary_text

    if len(version.text) < 2:  # less than 2 chars is likely failure
        raise ValueError("failed to parse {}\n{}".format(entry, version))

    # Look for an already stored version of this item with equal content.
    ident_version = data.Version.select().where(
        (data.Version.item == version.item) &
        (data.Version.title == version.title) &
        (data.Version.authors == version.authors) &
        (data.Version.text == version.text)).first()

    if ident_version:
        logging.info("No change, skip %s", item.uid)
    else:
        version.save()
        logging.info("Saved new version of %s: %s", item.uid, version.id)


def update(feed):
    """Fetch ``feed`` and record new versions of all of its entries.

    The stored feed title is refreshed when the parsed feed reports a
    different one.  A failure while processing a single entry is logged
    and does not stop the remaining entries from being processed.
    """
    logging.info("Update %s", feed.url)
    result = feedparser.parse(feed.url)

    # A feed without a title must not wipe the stored title.
    title = result['feed'].get('title')
    if title is not None and title != feed.title:
        feed.title = title
        feed.save()

    for entry in result['entries']:
        try:
            _process_entry(feed, entry)
        except Exception:
            # Best effort per entry; .get() so that an entry without a
            # link cannot raise again from inside this handler.
            logging.exception("Failed to process %s", entry.get('link'))


def _find_feed(feed_id):
    """Return the feed with the given ID, or None if there is no such feed."""
    return data.Feed.select().where(data.Feed.id == feed_id).first()


def main(argv=None):
    """Parse the command line and run the requested actions.

    Args:
        argv: argument list to parse; ``None`` lets argparse use
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description='Watch news feeds and record new versions of their '
                    'articles.')
    parser.add_argument('--add', metavar='URL',
                        help='add feed with URL')
    parser.add_argument('--update', action='store_true',
                        help='Update all known feeds')
    parser.add_argument('--update-feed', type=int, metavar='ID',
                        help='Update feed with ID')
    parser.add_argument('--remove', type=int, metavar='ID',
                        help='Remove feed with ID')
    parser.add_argument('--list', action='store_true',
                        help='List all registered feeds')
    parser.add_argument('--web', action='store_true',
                        help='Run web interface')
    # Parse before touching the database so that --help or a usage error
    # does not create the database file.
    args = parser.parse_args(argv)

    db = data.SqliteDatabase(os.path.join(
        os.path.dirname(__file__), "db.sqlite"))
    data.db.initialize(db)

    data.db.connect()
    # Only create the tables that do not exist yet.
    data.db.create_tables([table
                           for table in data.BaseModel.__subclasses__()
                           if not table.table_exists()])

    if args.update:
        for feed in data.Feed.select():
            try:
                update(feed)
            except Exception:
                logging.exception("Failed to update %s", feed.url)
    if args.update_feed is not None:
        feed = _find_feed(args.update_feed)
        if feed:
            update(feed)
        else:
            logging.error("No feed with ID %s", args.update_feed)
    if args.add:
        feed = data.Feed.create(url=args.add)
        print("Added {}".format(feed))
    if args.remove is not None:
        feed = _find_feed(args.remove)
        if feed:
            feed.delete_instance()
            print("Removed {}".format(feed))
        else:
            logging.error("No feed with ID %s", args.remove)
    if args.list:
        for feed in data.Feed.select().order_by(data.Feed.created_date):
            print("{0.id} - {1} - {0.title} - {0.url}".format(
                feed, feed.created_date.strftime('%x %X')))
    if args.web:
        web.run()


if __name__ == "__main__":
    main()