summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--README.md29
-rw-r--r--data.py41
-rw-r--r--requirements.txt5
-rwxr-xr-xwatchnews.py137
-rw-r--r--web.py130
6 files changed, 344 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9d458c7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+db.sqlite
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1d93fd9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+# Watchnews
+
+Aggregate RSS feeds and archive the linked articles for later diffing.
+
+
+
+# USAGE
+
+ usage: watchnews.py [-h] [--add URL] [--update] [--update-feed ID]
+ [--remove ID] [--list] [--web]
+
+    Watch news feeds: aggregate RSS and archive articles for later diffing.
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --add URL add feed with URL
+ --update Update all known feeds
+ --update-feed ID Update feed with ID
+ --remove ID Remove feed with ID
+ --list List all registered feeds
+ --web Run web interface
+
+
+## Example
+
+ ./watchnews.py --add 'http://blog.fefe.de/rss.xml?html'
+ ./watchnews.py --update
+ ./watchnews.py --web
+
diff --git a/data.py b/data.py
new file mode 100644
index 0000000..c1d2e82
--- /dev/null
+++ b/data.py
@@ -0,0 +1,41 @@
+import os
+import datetime
+import peewee as p
+import re
+
+class SqliteDatabase(p.SqliteDatabase):
+ foreign_keys = True
+ def _add_conn_hooks(self, conn):
+ p.SqliteDatabase._add_conn_hooks(self, conn)
+ conn.execute('PRAGMA foreign_keys = ON;').fetchone()
+
+db = p.Proxy()
+
+class BaseModel(p.Model):
+ class Meta:
+ database = db
+ def __json__(self):
+ return self._data
+ def __repr__(self):
+ return "<{.__class__.__name__} {}>".format(self, " ".join(
+ map(lambda kv: "{0[0]}={0[1]}".format(kv), self._data.items())))
+
+class Feed(BaseModel):
+ created_date = p.DateTimeField(default=datetime.datetime.now)
+ url = p.CharField(unique=True)
+ title = p.CharField(null=True)
+
+class Item(BaseModel):
+ created_date = p.DateTimeField(default=datetime.datetime.now)
+ feed = p.ForeignKeyField(Feed, related_name='items', on_delete='CASCADE')
+ uid = p.CharField()
+ title = p.CharField(null=True)
+
+class Version(BaseModel):
+ created_date = p.DateTimeField(default=datetime.datetime.now)
+ item = p.ForeignKeyField(Item, related_name='versions', on_delete='CASCADE')
+ url = p.CharField()
+ title = p.CharField()
+ authors = p.CharField()
+ text = p.CharField()
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..35b9fc2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+peewee==2.7.1
+feedparser>=5.1.0
+newspaper3k==0.1.5
+Flask==0.10.1
+ll-xist==5.13
diff --git a/watchnews.py b/watchnews.py
new file mode 100755
index 0000000..e22b483
--- /dev/null
+++ b/watchnews.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+import data
+import web
+
+import feedparser
+from newspaper import Article
+from newspaper.outputformatters import OutputFormatter
+from newspaper.cleaners import DocumentCleaner
+
+import os
+import argparse
+import logging
+
+
+logging.basicConfig(level=logging.INFO)
+
+def update(feed):
+ logging.info("Update %s", feed.url)
+ result = feedparser.parse(feed.url)
+ if result['feed']['title'] != feed.title:
+ feed.title = result['feed']['title']
+ feed.save()
+
+ for entry in result['entries']:
+ try:
+ url = entry['link']
+ if 'id' in entry:
+ uid = entry['id']
+ else:
+ uid = entry['link']
+
+ item = data.Item.select() \
+ .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
+ .first()
+ if not item:
+ item = data.Item.create(feed=feed, uid=uid)
+
+
+ paper = Article(entry['link'])
+ paper.download()
+ paper.parse()
+
+ if item.title != paper.title:
+ item.title = paper.title
+ item.save()
+
+ version = data.Version()
+ version.item = item
+ version.uid = uid
+ version.url = entry['link']
+ authors = paper.authors
+ authors.sort()
+ version.authors = ", ".join(authors)
+ version.title = paper.title
+ version.text = paper.text
+
+ # alternative, try if the rss body got a bigger text
+ document_cleaner = DocumentCleaner(paper.config)
+ output_formatter = OutputFormatter(paper.config)
+ # override this method since it doesn't work on summaries
+ output_formatter.links_to_text = lambda: None
+ doc = paper.config.get_parser().fromstring(entry['summary'])
+ doc = document_cleaner.clean(doc)
+ text, article_html = output_formatter.get_formatted(doc)
+
+ if len(text) < 2 and '<' not in entry['summary']:
+ text = entry['summary']
+
+ if len(text) > len(version.text):
+ version.text = text
+
+ if len(version.text) < 2: # less than 2 chars is likely failure
+ raise Exception("failed to parse {}\n{}".format(entry,version))
+
+ # search if the previous version was the same
+ ident_version = data.Version.select().where(
+ (data.Version.item == version.item) &
+ (data.Version.title == version.title) &
+ (data.Version.authors == version.authors) &
+ (data.Version.text == version.text)).first()
+
+ if ident_version:
+ logging.info("No change, skip %s", item.uid)
+ else:
+ version.save()
+ logging.info("Saved new version of %s: %s", item.uid, version.id)
+ except Exception as e:
+ logging.exception("Failed to process %s", entry['link'])
+
+
+if __name__ == "__main__":
+ db = data.SqliteDatabase(os.path.join(
+ os.path.dirname(__file__), "db.sqlite"))
+ data.db.initialize(db)
+
+ data.db.connect()
+ data.db.create_tables(filter(lambda t: not t.table_exists(),
+ data.BaseModel.__subclasses__()))
+
+    parser = argparse.ArgumentParser(description='Watch news feeds: aggregate RSS and archive articles for later diffing.')
+ parser.add_argument('--add', metavar='URL',
+ help='add feed with URL')
+ parser.add_argument('--update', action='store_true',
+ help='Update all known feeds')
+ parser.add_argument('--update-feed', type=int, metavar='ID',
+ help='Update feed with ID')
+ parser.add_argument('--remove', type=int, metavar='ID',
+ help='Remove feed with ID')
+ parser.add_argument('--list', action='store_true',
+ help='List all registered feeds')
+ parser.add_argument('--web', action='store_true',
+ help='Run web interface')
+
+ args = parser.parse_args()
+ if args.update:
+ for feed in data.Feed.select():
+ try:
+ update(feed)
+ except Exception as e:
+ logging.exception("Failed to update %s", feed.url)
+ if args.update_feed:
+ feed = data.Feed.get(data.Feed.id == args.update_feed)
+ if feed:
+ update(feed)
+ if args.add:
+ feed = data.Feed.create(url=args.add)
+ print("Added {}".format(feed))
+ if args.remove:
+ feed = data.Feed.get(data.Feed.id == args.remove)
+ feed.delete_instance()
+ print("Removed {}".format(feed))
+ if args.list:
+ for feed in data.Feed.select().order_by(data.Feed.created_date):
+ print("{0.id} - {1} - {0.title} - {0.url}".format(
+ feed, feed.created_date.strftime('%x %X')))
+ if args.web:
+ web.run()
diff --git a/web.py b/web.py
new file mode 100644
index 0000000..35f2e89
--- /dev/null
+++ b/web.py
@@ -0,0 +1,130 @@
+import data
+
+from ll.xist import xsc, parse
+from ll.xist.ns import html, xml, meta
+from flask import Flask, Response, url_for
+
+import difflib
+
+
+class View:
+
+ @staticmethod
+ def template(title, body):
+ return xsc.Frag(
+ html.DocTypeXHTML10transitional(),
+ "\n",
+ html.html(
+ html.head(
+ html.meta(charset='utf-8'),
+ html.title(title),
+ html.style("""\
+.diff_add {
+color: green;
+}
+
+table.diff {font-family:Courier; border:medium;}
+.diff_header {background-color:#e0e0e0}
+td.diff_header {text-align:right}
+.diff_next {background-color:#c0c0c0}
+.diff_sub {background-color:#ffaaaa}
+
+tr td:nth-child(3) .diff_chg {
+ color: blue;
+}
+tr td:nth-child(6) .diff_chg {
+ color: orangered;
+}""")
+ ),
+ html.body(body)))
+
+ @staticmethod
+ def index(feeds):
+ return View.template("Index", html.ul(
+ list(map(lambda feed:
+ html.li(
+ html.a(feed.title or feed.url, href=url_for('feed', id=feed.id))),
+ feeds))))
+
+ @staticmethod
+ def feed(feed, items):
+ return View.template("Feed {}".format(feed.title or feed.url),
+ html.div(
+ html.a("Back", href=url_for('index')),
+ html.h1(feed.title),
+ *list(map(lambda item: [
+ html.h2(html.a(item.title, href=url_for('item', id=item.id))),
+ html.ul(
+ *map(lambda version:
+ html.li("{} {}".format(version.created_date.strftime("%x %X"),
+ version.title)),
+ item.versions)
+ )
+ ], items))))
+
+ @staticmethod
+ def format_version(a, b):
+ temp = """\
+Title: {0.title}
+Authors: {0.authors}
+Url: {0.url}
+Text:
+{0.text}"""
+ if a == None:
+ adata = []
+ fromdate = ""
+ else:
+ adata = temp.format(a).split("\n")
+ fromdate = a.created_date.strftime("%x %X")
+ todate = b.created_date.strftime("%x %X")
+
+ bdata = temp.format(b).split("\n")
+ table = difflib.HtmlDiff(wrapcolumn=60) \
+ .make_table(adata, bdata,
+ fromdesc=fromdate, todesc=todate)
+
+ table = table.encode('utf-8')
+ node = parse.tree(table, parse.Expat(), parse.NS(
+ html), parse.Node(pool=xsc.Pool(html)))
+ return html.div(node)
+
+ @staticmethod
+ def item(item, versions):
+ versionsA = versions
+ versionsB = [None] + versions[:-1]
+ versions = list(zip(versionsA, versionsB))
+ return View.template("Item: {}".format(item.title),
+ html.div(
+ html.a("Back to {}".format(item.feed.title),
+ href=url_for('feed', id=item.feed.id)),
+ html.h1(item.title),
+ *list(map(lambda versionAB:
+ View.format_version(versionAB[1], versionAB[0]),
+ versions))
+ ))
+
+
+def run():
+ app = Flask(__name__)
+
+ @app.route('/')
+ def index():
+ return View.index(data.Feed.select()).string("utf-8")
+
+ @app.route('/feed/<id>')
+ def feed(id):
+ feed = data.Feed.get(data.Feed.id == id)
+ items = data.Item.select() \
+ .where(data.Item.feed == feed) \
+ .order_by(data.Item.created_date.desc())
+ return View.feed(feed, items).string("utf-8")
+
+ @app.route('/item/<id>')
+ def item(id):
+ item = data.Item.get(data.Item.id == id)
+ versions = data.Version.select() \
+ .where(data.Version.item == item) \
+ .order_by(data.Version.created_date)
+ return View.item(item, list(versions)).string("utf-8")
+
+ app.run()