diff options
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | README.md | 29 | ||||
-rw-r--r-- | data.py | 60 | ||||
-rw-r--r-- | requirements.txt | 5 | ||||
-rwxr-xr-x | watchnews.py | 170 | ||||
-rw-r--r-- | web.py | 156 |
6 files changed, 422 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9d458c7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+db.sqlite
diff --git a/README.md b/README.md
new file mode 100644
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+# Watchnews
+
+Aggregate RSS feeds and archive the linked articles for later diffing.
+
+
+
+# USAGE
+
+    usage: watchnews.py [-h] [--add URL] [--update] [--update-feed ID]
+                        [--remove ID] [--list] [--web]
+
+    Aggregate RSS feeds and archive articles for diffing.
+
+    optional arguments:
+      -h, --help        show this help message and exit
+      --add URL         add feed with URL
+      --update          Update all known feeds
+      --update-feed ID  Update feed with ID
+      --remove ID       Remove feed with ID
+      --list            List all registered feeds
+      --web             Run web interface
+
+
+## Example
+
+    ./watchnews.py --add 'http://blog.fefe.de/rss.xml?html'
+    ./watchnews.py --update
+    ./watchnews.py --web
+
diff --git a/data.py b/data.py
new file mode 100644
--- /dev/null
+++ b/data.py
@@ -0,0 +1,60 @@
+"""Database models for watchnews: feeds, their items and item versions."""
+import os
+import datetime
+import peewee as p
+import re
+
+
+class SqliteDatabase(p.SqliteDatabase):
+    """SQLite database that switches on foreign key enforcement."""
+    foreign_keys = True
+
+    def _add_conn_hooks(self, conn):
+        p.SqliteDatabase._add_conn_hooks(self, conn)
+        # The pragma applies per connection; ON DELETE CASCADE needs it.
+        conn.execute('PRAGMA foreign_keys = ON;').fetchone()
+
+
+# Placeholder; watchnews.py binds the real database at startup.
+db = p.Proxy()
+
+
+class BaseModel(p.Model):
+    """Common base of all models, bound to the ``db`` proxy."""
+
+    class Meta:
+        database = db
+
+    def __json__(self):
+        return self._data
+
+    def __repr__(self):
+        fields = " ".join(
+            "{}={}".format(key, value) for key, value in self._data.items())
+        return "<{} {}>".format(self.__class__.__name__, fields)
+
+
+class Feed(BaseModel):
+    """A registered feed URL."""
+    created_date = p.DateTimeField(default=datetime.datetime.now)
+    url = p.CharField(unique=True)
+    title = p.CharField(null=True)
+
+
+class Item(BaseModel):
+    """One entry of a feed, identified within the feed by ``uid``."""
+    created_date = p.DateTimeField(default=datetime.datetime.now)
+    feed = p.ForeignKeyField(Feed, related_name='items', on_delete='CASCADE')
+    uid = p.CharField()
+    title = p.CharField(null=True)
+
+
+class Version(BaseModel):
+    """A snapshot of the article content of an item."""
+    created_date = p.DateTimeField(default=datetime.datetime.now)
+    item = p.ForeignKeyField(Item, related_name='versions', on_delete='CASCADE')
+    url = p.CharField()
+    title = p.CharField()
+    authors = p.CharField()
+    # Article bodies are long; TextField avoids CharField's 255 limit.
+    text = p.TextField()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..35b9fc2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+peewee==2.7.1
+feedparser>=5.1.0
+newspaper3k==0.1.5
+Flask==0.10.1
+ll-xist==5.13
diff --git a/watchnews.py b/watchnews.py
new file mode 100755
--- /dev/null
+++ b/watchnews.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""Command line tool that tracks RSS feeds and archives article versions."""
+import data
+import web
+
+import feedparser
+from newspaper import Article
+from newspaper.outputformatters import OutputFormatter
+from newspaper.cleaners import DocumentCleaner
+
+import os
+import argparse
+import logging
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+def _summary_text(paper, summary):
+    """Return the text of a feed entry's ``summary`` markup.
+
+    The summary is run through the parser, cleaner and formatter
+    configured by ``paper.config``.
+    """
+    document_cleaner = DocumentCleaner(paper.config)
+    output_formatter = OutputFormatter(paper.config)
+    # override this method since it doesn't work on summaries
+    output_formatter.links_to_text = lambda: None
+    doc = paper.config.get_parser().fromstring(summary)
+    doc = document_cleaner.clean(doc)
+    text, article_html = output_formatter.get_formatted(doc)
+
+    # A summary without markup is already plain text.
+    if len(text) < 2 and '<' not in summary:
+        text = summary
+    return text
+
+
+def _update_entry(feed, entry):
+    """Download the article behind ``entry`` and save it if it changed."""
+    url = entry['link']
+    uid = entry['id'] if 'id' in entry else url
+
+    item = data.Item.select() \
+        .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
+        .first()
+    if not item:
+        item = data.Item.create(feed=feed, uid=uid)
+
+    paper = Article(url)
+    paper.download()
+    paper.parse()
+
+    if item.title != paper.title:
+        item.title = paper.title
+        item.save()
+
+    version = data.Version()
+    version.item = item
+    version.url = url
+    # sorted() instead of .sort(): leave paper.authors untouched
+    version.authors = ", ".join(sorted(paper.authors))
+    version.title = paper.title
+    version.text = paper.text
+
+    # alternative, try if the rss body got a bigger text
+    summary = entry.get('summary')
+    if summary:
+        text = _summary_text(paper, summary)
+        if len(text) > len(version.text):
+            version.text = text
+
+    if len(version.text) < 2:  # less than 2 chars is likely failure
+        raise Exception("failed to parse {}\n{}".format(entry, version))
+
+    # search if the previous version was the same
+    ident_version = data.Version.select().where(
+        (data.Version.item == version.item) &
+        (data.Version.title == version.title) &
+        (data.Version.authors == version.authors) &
+        (data.Version.text == version.text)).first()
+
+    if ident_version:
+        logging.info("No change, skip %s", item.uid)
+    else:
+        version.save()
+        logging.info("Saved new version of %s: %s", item.uid, version.id)
+
+
+def update(feed):
+    """Fetch ``feed`` and archive a new version of every changed entry.
+
+    A failure in one entry is logged and does not stop the others.
+    """
+    logging.info("Update %s", feed.url)
+    result = feedparser.parse(feed.url)
+    # A feed that could not be parsed has no title; keep the stored one.
+    title = result['feed'].get('title')
+    if title and title != feed.title:
+        feed.title = title
+        feed.save()
+
+    for entry in result['entries']:
+        try:
+            _update_entry(feed, entry)
+        except Exception:
+            # .get(): an entry without a link must not raise in here
+            logging.exception("Failed to process %s", entry.get('link'))
+
+
+def main():
+    """Open the database and run the actions given on the command line."""
+    db = data.SqliteDatabase(os.path.join(
+        os.path.dirname(__file__), "db.sqlite"))
+    data.db.initialize(db)
+
+    data.db.connect()
+    data.db.create_tables([model for model in data.BaseModel.__subclasses__()
+                           if not model.table_exists()])
+
+    parser = argparse.ArgumentParser(
+        description='Aggregate RSS feeds and archive articles for diffing.')
+    parser.add_argument('--add', metavar='URL',
+                        help='add feed with URL')
+    parser.add_argument('--update', action='store_true',
+                        help='Update all known feeds')
+    parser.add_argument('--update-feed', type=int, metavar='ID',
+                        help='Update feed with ID')
+    parser.add_argument('--remove', type=int, metavar='ID',
+                        help='Remove feed with ID')
+    parser.add_argument('--list', action='store_true',
+                        help='List all registered feeds')
+    parser.add_argument('--web', action='store_true',
+                        help='Run web interface')
+
+    args = parser.parse_args()
+    if args.update:
+        for feed in data.Feed.select():
+            try:
+                update(feed)
+            except Exception:
+                logging.exception("Failed to update %s", feed.url)
+    if args.update_feed is not None:
+        # Feed.get() raises DoesNotExist for an unknown ID instead of
+        # returning None.
+        try:
+            feed = data.Feed.get(data.Feed.id == args.update_feed)
+        except data.Feed.DoesNotExist:
+            parser.error("no feed with ID {}".format(args.update_feed))
+        update(feed)
+    if args.add:
+        feed = data.Feed.create(url=args.add)
+        print("Added {}".format(feed))
+    if args.remove is not None:
+        try:
+            feed = data.Feed.get(data.Feed.id == args.remove)
+        except data.Feed.DoesNotExist:
+            parser.error("no feed with ID {}".format(args.remove))
+        feed.delete_instance()
+        print("Removed {}".format(feed))
+    if args.list:
+        for feed in data.Feed.select().order_by(data.Feed.created_date):
+            print("{0.id} - {1} - {0.title} - {0.url}".format(
+                feed, feed.created_date.strftime('%x %X')))
+    if args.web:
+        web.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/web.py b/web.py
new file mode 100644
--- /dev/null
+++ b/web.py
@@ -0,0 +1,156 @@
+"""Web interface for browsing feeds, items and the diffs between versions."""
+import data
+
+from ll.xist import xsc, parse
+from ll.xist.ns import html, xml, meta
+from flask import Flask, Response, abort, url_for
+
+import difflib
+
+
+class View:
+    """Builds the XIST node trees for the pages served by ``run``."""
+
+    @staticmethod
+    def template(title, body):
+        """Wrap ``body`` in the HTML page skeleton titled ``title``."""
+        return xsc.Frag(
+            html.DocTypeXHTML10transitional(),
+            "\n",
+            html.html(
+                html.head(
+                    html.meta(charset='utf-8'),
+                    html.title(title),
+                    html.style("""\
+.diff_add {
+color: green;
+}
+
+table.diff {font-family:Courier; border:medium;}
+.diff_header {background-color:#e0e0e0}
+td.diff_header {text-align:right}
+.diff_next {background-color:#c0c0c0}
+.diff_sub {background-color:#ffaaaa}
+
+tr td:nth-child(3) .diff_chg {
+  color: blue;
+}
+tr td:nth-child(6) .diff_chg {
+  color: orangered;
+}""")
+                ),
+                html.body(body)))
+
+    @staticmethod
+    def index(feeds):
+        """Return the start page: a list linking to every feed."""
+        return View.template("Index", html.ul(
+            [html.li(html.a(feed.title or feed.url,
+                            href=url_for('feed', id=feed.id)))
+             for feed in feeds]))
+
+    @staticmethod
+    def feed(feed, items):
+        """Return the page of ``feed`` with its items and their versions."""
+        # title is nullable on feeds and items; fall back to url / uid
+        name = feed.title or feed.url
+        sections = [
+            xsc.Frag(
+                html.h2(html.a(item.title or item.uid,
+                               href=url_for('item', id=item.id))),
+                html.ul(*[
+                    html.li("{} {}".format(
+                        version.created_date.strftime("%x %X"),
+                        version.title))
+                    for version in item.versions]))
+            for item in items]
+        return View.template(
+            "Feed {}".format(name),
+            html.div(
+                html.a("Back", href=url_for('index')),
+                html.h1(name),
+                *sections))
+
+    @staticmethod
+    def format_version(a, b):
+        """Return a div holding the side-by-side diff from ``a`` to ``b``.
+
+        ``a`` is None when ``b`` is the first version of an item.
+        """
+        temp = """\
+Title: {0.title}
+Authors: {0.authors}
+Url: {0.url}
+Text:
+{0.text}"""
+        if a is None:
+            adata = []
+            fromdate = ""
+        else:
+            adata = temp.format(a).split("\n")
+            fromdate = a.created_date.strftime("%x %X")
+        todate = b.created_date.strftime("%x %X")
+
+        bdata = temp.format(b).split("\n")
+        table = difflib.HtmlDiff(wrapcolumn=60) \
+            .make_table(adata, bdata,
+                        fromdesc=fromdate, todesc=todate)
+
+        # make_table returns markup as a string; parse it back into nodes
+        table = table.encode('utf-8')
+        node = parse.tree(table, parse.Expat(), parse.NS(
+            html), parse.Node(pool=xsc.Pool(html)))
+        return html.div(node)
+
+    @staticmethod
+    def item(item, versions):
+        """Return the page of ``item``: one diff per stored version.
+
+        ``versions`` is a list; each entry is diffed against the one
+        before it, the first against nothing.
+        """
+        name = item.title or item.uid
+        feed_name = item.feed.title or item.feed.url
+        predecessors = [None] + versions[:-1]
+        return View.template(
+            "Item: {}".format(name),
+            html.div(
+                html.a("Back to {}".format(feed_name),
+                       href=url_for('feed', id=item.feed.id)),
+                html.h1(name),
+                *[View.format_version(old, new)
+                  for new, old in zip(versions, predecessors)]))
+
+
+def run():
+    """Start the Flask development server for the web interface."""
+    app = Flask(__name__)
+
+    @app.route('/')
+    def index():
+        return View.index(data.Feed.select()).string("utf-8")
+
+    # The int converter makes Flask reject non-numeric IDs with a 404.
+    @app.route('/feed/<int:id>')
+    def feed(id):
+        try:
+            feed = data.Feed.get(data.Feed.id == id)
+        except data.Feed.DoesNotExist:
+            abort(404)
+        items = data.Item.select() \
+            .where(data.Item.feed == feed) \
+            .order_by(data.Item.created_date.desc())
+        return View.feed(feed, items).string("utf-8")
+
+    @app.route('/item/<int:id>')
+    def item(id):
+        try:
+            item = data.Item.get(data.Item.id == id)
+        except data.Item.DoesNotExist:
+            abort(404)
+        versions = data.Version.select() \
+            .where(data.Version.item == item) \
+            .order_by(data.Version.created_date)
+        return View.item(item, list(versions)).string("utf-8")
+
+    app.run()