Diffstat (limited to 'watchnews.py')
-rwxr-xr-x  watchnews.py  137
1 file changed, 137 insertions, 0 deletions
diff --git a/watchnews.py b/watchnews.py
new file mode 100755
index 0000000..e22b483
--- /dev/null
+++ b/watchnews.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+import data
+import web
+
+import feedparser
+from newspaper import Article
+from newspaper.outputformatters import OutputFormatter
+from newspaper.cleaners import DocumentCleaner
+
+import os
+import argparse
+import logging
+
+
+logging.basicConfig(level=logging.INFO)
+
+def update(feed):
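+    """Fetch the feed, parse each entry's article and store a new
+    Version whenever the content differs from what is already stored."""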
+ logging.info("Update %s", feed.url)
+ result = feedparser.parse(feed.url)
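+    # keep the stored feed title in sync with the published one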
+ if result['feed']['title'] != feed.title:
+ feed.title = result['feed']['title']
+ feed.save()
+
+ for entry in result['entries']:
+ try:
+            url = entry['link']
+            # fall back to the link when the entry has no explicit id
+            uid = entry.get('id', url)
+
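+            # look up this entry's item, creating it on first sight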
+ item = data.Item.select() \
+ .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
+ .first()
+ if not item:
+ item = data.Item.create(feed=feed, uid=uid)
+
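+            # download and parse the full article with newspaper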
+            paper = Article(url)
+ paper.download()
+ paper.parse()
+
+ if item.title != paper.title:
+ item.title = paper.title
+ item.save()
+
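+            # assemble a candidate new version from the parsed article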
+ version = data.Version()
+ version.item = item
+ version.uid = uid
+            version.url = url
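+            # sort authors so the joined string is stable across runs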
+ authors = paper.authors
+ authors.sort()
+ version.authors = ", ".join(authors)
+ version.title = paper.title
+ version.text = paper.text
+
+            # alternatively, check whether the RSS summary yields a longer text
+            if 'summary' in entry:
+                document_cleaner = DocumentCleaner(paper.config)
+                output_formatter = OutputFormatter(paper.config)
+                # override this method since it doesn't work on summaries
+                output_formatter.links_to_text = lambda: None
+                doc = paper.config.get_parser().fromstring(entry['summary'])
+                doc = document_cleaner.clean(doc)
+                text, article_html = output_formatter.get_formatted(doc)
+
+                if len(text) < 2 and '<' not in entry['summary']:
+                    text = entry['summary']
+
+                if len(text) > len(version.text):
+                    version.text = text
+
+ if len(version.text) < 2: # less than 2 chars is likely failure
+                raise Exception("failed to parse {}\n{}".format(entry, version))
+
+            # check whether an identical version was already stored
+ ident_version = data.Version.select().where(
+ (data.Version.item == version.item) &
+ (data.Version.title == version.title) &
+ (data.Version.authors == version.authors) &
+ (data.Version.text == version.text)).first()
+
+ if ident_version:
+ logging.info("No change, skip %s", item.uid)
+ else:
+ version.save()
+ logging.info("Saved new version of %s: %s", item.uid, version.id)
+        except Exception:
+ logging.exception("Failed to process %s", entry['link'])
+
+
+if __name__ == "__main__":
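+    # open the sqlite database next to this script and create missing tables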
+ db = data.SqliteDatabase(os.path.join(
+ os.path.dirname(__file__), "db.sqlite"))
+ data.db.initialize(db)
+
+ data.db.connect()
+    data.db.create_tables([model for model in data.BaseModel.__subclasses__()
+                           if not model.table_exists()])
+
+    parser = argparse.ArgumentParser(
+        description='Watch news feeds and record changed article versions.')
+ parser.add_argument('--add', metavar='URL',
+ help='add feed with URL')
+ parser.add_argument('--update', action='store_true',
+ help='Update all known feeds')
+ parser.add_argument('--update-feed', type=int, metavar='ID',
+ help='Update feed with ID')
+ parser.add_argument('--remove', type=int, metavar='ID',
+ help='Remove feed with ID')
+ parser.add_argument('--list', action='store_true',
+ help='List all registered feeds')
+ parser.add_argument('--web', action='store_true',
+ help='Run web interface')
+
+ args = parser.parse_args()
+ if args.update:
+ for feed in data.Feed.select():
+ try:
+ update(feed)
+        except Exception:
+ logging.exception("Failed to update %s", feed.url)
+ if args.update_feed:
+ feed = data.Feed.get(data.Feed.id == args.update_feed)
+ if feed:
+ update(feed)
+ if args.add:
+ feed = data.Feed.create(url=args.add)
+ print("Added {}".format(feed))
+ if args.remove:
+ feed = data.Feed.get(data.Feed.id == args.remove)
+ feed.delete_instance()
+ print("Removed {}".format(feed))
+ if args.list:
+ for feed in data.Feed.select().order_by(data.Feed.created_date):
+ print("{0.id} - {1} - {0.title} - {0.url}".format(
+ feed, feed.created_date.strftime('%x %X')))
+ if args.web:
+ web.run()