diff options
Diffstat (limited to 'watchnews')
-rw-r--r-- | watchnews/__init__.py | 1 | ||||
-rw-r--r-- | watchnews/css.py | 29 | ||||
-rw-r--r-- | watchnews/data.py | 50 | ||||
-rwxr-xr-x | watchnews/fetch.py | 83 | ||||
-rw-r--r-- | watchnews/web.py | 207 | ||||
-rw-r--r-- | watchnews/wsgi.py | 11 |
6 files changed, 381 insertions, 0 deletions
# NOTE: this span was a whitespace-collapsed unified diff adding the files
# below.  It is reproduced here as conventionally formatted Python; each
# banner marks where one file of the original commit begins.

# ---------------------------------------------------------------------------
# watchnews/__init__.py   (new file mode 100644, index 0000000..1bb8bf6)
# ---------------------------------------------------------------------------
# empty

# ---------------------------------------------------------------------------
# watchnews/css.py   (new file mode 100644, index 0000000..b32a274)
# ---------------------------------------------------------------------------
#!/usr/bin/python3
import re


class Rule:
    """A single CSS rule: a selector path plus a set of properties.

    Positional arguments are the selector parts (joined with a space when
    rendered); keyword arguments are the properties, written in camelCase
    and rendered with dashes (``backgroundColor`` -> ``background-color``).
    """

    def __init__(self, *path, **properties):
        self.path = path
        self.properties = properties

    def _f(self, prop):
        """Render one ``(key, value)`` property pair as a CSS declaration."""
        key, value = prop
        # Insert a dash before every capital letter, then lower-case it all.
        key = re.sub("([A-Z])", "-\\1", key).lower()
        return " {}: {};".format(key, value)

    def __format__(self, format_spec=""):
        """Return the rule as CSS text.

        ``format_spec`` is optional so that the historical zero-argument
        call ``rule.__format__()`` keeps working while ``format(rule)`` and
        f-strings (which always pass a spec) no longer raise ``TypeError``.
        A non-empty spec is applied to the rendered string.
        """
        result = " ".join(self.path)
        result += " {\n"
        result += "\n".join(map(self._f, self.properties.items()))
        result += "\n}\n"
        return format(result, format_spec)

    def __str__(self):
        """Return the same CSS text as ``__format__``."""
        return self.__format__()


def string(*rules):
    """Render all *rules* and join them with newlines."""
    return "\n".join(rule.__format__() for rule in rules)


if __name__ == "__main__":
    print("CSS Demo")
    print(Rule(".foo", "#blah", backgroundColor="red").__format__())

# ---------------------------------------------------------------------------
# watchnews/data.py   (new file mode 100644, index 0000000..e4ecc24)
# ---------------------------------------------------------------------------
import peewee as p

import os
import datetime
import re

# Placeholder database; bound to a real one by init_sqlite().
db = p.Proxy()


class SqliteDatabase(p.SqliteDatabase):
    """SQLite database that switches foreign-key enforcement on per connection."""

    foreign_keys = True

    def _add_conn_hooks(self, conn):
        p.SqliteDatabase._add_conn_hooks(self, conn)
        conn.execute('PRAGMA foreign_keys = ON;').fetchone()


def init_sqlite(path):
    """Bind ``db`` to the SQLite file at *path*, connect, and create tables.

    Only tables of direct ``BaseModel`` subclasses that do not exist yet
    are created.
    """
    db.initialize(SqliteDatabase(path))

    db.connect()
    missing = [model for model in BaseModel.__subclasses__()
               if not model.table_exists()]
    db.create_tables(missing)


class BaseModel(p.Model):
    """Common base for all models; binds them to the module-level ``db``."""

    class Meta:
        database = db

    def __json__(self):
        """Return the model's raw field data."""
        return self._data

    def __repr__(self):
        return "<{.__class__.__name__} {}>".format(self, " ".join(
            "{0[0]}={0[1]}".format(kv) for kv in self._data.items()))


class Feed(BaseModel):
    """A subscribed feed, identified by its unique URL."""

    created_date = p.DateTimeField(default=datetime.datetime.now)
    url = p.CharField(unique=True)
    title = p.CharField(null=True)


class Item(BaseModel):
    """An entry of a feed; deleted together with its feed."""

    created_date = p.DateTimeField(default=datetime.datetime.now)
    feed = p.ForeignKeyField(Feed, related_name='items', on_delete='CASCADE')
    uid = p.CharField()
    title = p.CharField(null=True)


class Version(BaseModel):
    """One captured state of an item; deleted together with its item."""

    created_date = p.DateTimeField(default=datetime.datetime.now)
    item = p.ForeignKeyField(Item, related_name='versions', on_delete='CASCADE')
    url = p.CharField()
    title = p.CharField()
    authors = p.CharField()
    text = p.CharField()


# ---------------------------------------------------------------------------
# watchnews/fetch.py   (new file mode 100755, index 0000000..5201c1e)
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
from . import data

import feedparser
from newspaper import Article
from newspaper.outputformatters import OutputFormatter
from newspaper.cleaners import DocumentCleaner

import logging


def _summary_text(paper, summary):
    """Extract plain text from a feed entry's *summary* markup.

    Runs the summary through the same cleaner/formatter configuration as
    *paper*.  If that yields fewer than two characters and the summary
    contains no ``<``, the raw summary is returned instead.
    """
    document_cleaner = DocumentCleaner(paper.config)
    output_formatter = OutputFormatter(paper.config)
    # override this method since it doesn't work on summaries
    output_formatter.links_to_text = lambda: None
    doc = paper.config.get_parser().fromstring(summary)
    doc = document_cleaner.clean(doc)
    text, _article_html = output_formatter.get_formatted(doc)

    if len(text) < 2 and '<' not in summary:
        text = summary
    return text


def update(feed):
    """Fetch *feed*, and store a new ``Version`` for every changed entry.

    For each entry the linked article is downloaded and parsed; the longer
    of the article text and the entry summary text is kept.  A version is
    saved only when no stored version of the same item has identical
    title, authors and text.  Failures on a single entry are logged and do
    not stop the remaining entries.
    """
    logging.info("Update %s", feed.url)
    result = feedparser.parse(feed.url)

    # A feed without a title (or a failed fetch) must neither raise nor
    # wipe the title already stored.
    feed_title = result['feed'].get('title')
    if feed_title is not None and feed_title != feed.title:
        feed.title = feed_title
        feed.save()

    for entry in result['entries']:
        try:
            url = entry['link']
            uid = entry['id'] if 'id' in entry else url

            item = data.Item.select() \
                .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
                .first()
            if not item:
                item = data.Item.create(feed=feed, uid=uid)

            paper = Article(url)
            paper.download()
            paper.parse()

            if item.title != paper.title:
                item.title = paper.title
                item.save()

            version = data.Version()
            version.item = item
            # NOTE(review): Version declares no ``uid`` field, so this is a
            # plain instance attribute and is not persisted - confirm intent.
            version.uid = uid
            version.url = url
            # sorted() instead of .sort(): do not mutate paper.authors.
            version.authors = ", ".join(sorted(paper.authors))
            version.title = paper.title
            version.text = paper.text

            # alternative, try if the rss body got a bigger text
            summary = entry.get('summary') or ''
            if summary:
                text = _summary_text(paper, summary)
                if len(text) > len(version.text):
                    version.text = text

            if len(version.text) < 2:  # less than 2 chars is likely failure
                raise Exception("failed to parse {}\n{}".format(entry,version))

            # search if an identical version of this item is already stored
            ident_version = data.Version.select().where(
                (data.Version.item == version.item) &
                (data.Version.title == version.title) &
                (data.Version.authors == version.authors) &
                (data.Version.text == version.text)).first()

            if ident_version:
                logging.info("No change, skip %s", item.uid)
            else:
                version.save()
                logging.info("Saved new version of %s: %s", item.uid, version.id)
        except Exception:
            # .get(): an entry without a link must not raise again here and
            # abort the remaining entries.
            logging.exception("Failed to process %s", entry.get('link'))


# ---------------------------------------------------------------------------
# watchnews/web.py   (new file mode 100644, index 0000000..438e649)
# ---------------------------------------------------------------------------
from . import data, css

from ll.xist import xsc, parse
from ll.xist.ns import html
from flask import Flask, url_for

import re
import difflib


class DiffSupport:
    """Mixin that turns ``difflib._mdiff`` output into XIST HTML nodes.

    NOTE(review): ``difflib._mdiff`` is a private stdlib function; this
    code relies on its ``\\x00<flag>...\\x01`` change markers.
    """

    # CSS class for each change flag that follows a "\x00" marker.
    _ACTION_CLASSES = {
        '+': 'diff_add',
        '-': 'diff_sub',
        '^': 'diff_chg',
    }

    def _diff(self, line1, line2):
        """Diff two single lines; return ``(from_spans, to_spans)``."""
        diff = list(difflib._mdiff([line1], [line2]))
        return (self._format_diff(diff[0][0][1]),
                self._format_diff(diff[0][1][1]))

    def _diff_lines(self, lines1, lines2):
        """Diff two lists of lines and return a ``textdiff`` table.

        Each row holds an empty header cell, then for both sides the value
        ``_mdiff`` reports first (used as the line number) and the
        formatted line text.
        """
        rows = []
        for ((line1, diff1), (line2, diff2), flag) in difflib._mdiff(lines1, lines2):
            rows.append(html.tr(
                html.th(),
                html.td(line1),
                html.td(*self._format_diff(diff1)),
                html.td(line2),
                html.td(*self._format_diff(diff2))
            ))
        return html.table(rows, **{'class': 'textdiff'})

    def _format_diff(self, line):
        """Split a marked-up diff line into a list of ``html.span`` nodes.

        Text between ``\\x00<flag>`` and ``\\x01`` becomes a span carrying
        the CSS class for that flag; text outside markers becomes a plain
        span.  Raises ``KeyError`` for a flag other than ``+``, ``-``, ``^``.
        """
        elems = []
        nextpos = line.find("\x00")
        while nextpos != -1 and nextpos + 1 < len(line):
            actionclass = self._ACTION_CLASSES[line[nextpos + 1]]
            endpos = line.find("\x01", nextpos + 2)
            if endpos == -1:
                # No end marker: treat the rest of the line as changed.
                endpos = len(line)

            if nextpos != 0:  # intermediate unchanged text
                elems.append(html.span(line[:nextpos]))

            text = line[nextpos + 2:endpos]
            elems.append(html.span(text, **{'class': actionclass}))

            # Skip past the "\x01" end marker itself so the control
            # character does not leak into the following span.
            line = line[endpos + 1:]
            nextpos = line.find("\x00")

        if line != "":  # trailing unchanged text
            elems.append(html.span(line))
        return elems


class Difftable(html.div, DiffSupport):
    """A ``div`` showing one version, or the diff between two versions."""

    def __init__(self, to_version, from_version=None):
        super().__init__()
        if from_version is None:
            self.single_version(to_version)
        else:
            self.two_versions(to_version, from_version)

    def single_version(self, version):
        """Append a table with title, date, link and text of *version*."""
        paragraphs = [html.p(par) for par in version.text.split("\n")]
        self.append(html.table(
            html.tr(html.th("Title"), html.td(version.title)),
            html.tr(html.th("Date"), html.td(
                version.created_date.strftime("%x %X"))),
            html.tr(html.th("Link"), html.td(
                html.a(version.url, href=version.url))),
            html.tr(html.th("Text", colspan=2)),
            html.tr(html.td(paragraphs, colspan=2))
        ))

    def two_versions(self, to_version, from_version):
        """Append a table comparing *from_version* with *to_version*."""
        def prepare_text(text):
            # Collapse runs of blank lines to one, then split into lines.
            return re.sub("\n\n\n*", "\n\n", text).split("\n")
        from_text = prepare_text(from_version.text)
        to_text = prepare_text(to_version.text)

        from_difftitle, to_difftitle = self._diff(
            from_version.title, to_version.title)

        self.append(html.table(
            html.tr(html.th("Title"),
                    html.td(from_difftitle),
                    html.td(to_difftitle), **{'class': 'textdiff'}),
            html.tr(html.th("Date"),
                    html.td(from_version.created_date.strftime("%x %X")),
                    html.td(to_version.created_date.strftime("%x %X"))),
            html.tr(html.th("Link"),
                    html.td(html.a(from_version.url, href=from_version.url)),
                    html.td(html.a(to_version.url, href=to_version.url))),
            html.tr(html.th("Text", colspan=3)),
            html.tr(html.td(self._diff_lines(from_text, to_text), colspan=3)),
            **{'class': "versiondiff"}
        ))


class ItemWidget(html.div, DiffSupport):
    """A ``div`` with an item's linked title and a list of its versions."""

    def __init__(self, item):
        super().__init__()
        self.append(html.h2(html.a(item.title,
                                   href=url_for('item', id=item.id))))
        # Materialise once so both sides of the pairing see the same rows.
        all_versions = list(item.versions)
        previous_versions = [None] + all_versions[:-1]
        versions = list(zip(previous_versions, all_versions))

        self.append(html.ul(
            *map(self.version, versions)))

    def version(self, versions):
        """Render one ``(from_version, to_version)`` pair as a list item.

        The title is shown as a diff against the previous version, or
        plainly when there is no previous version.
        """
        from_version, to_version = versions
        if from_version is None:
            title = html.span(to_version.title)
        else:
            from_difftitle, to_difftitle = self._diff(
                from_version.title, to_version.title)
            title = html.span(to_difftitle, **{'class': 'textdiff'})
        return html.li(html.span(to_version.created_date.strftime("%x %X")),
                       html.span(" - "),
                       title)


class Template:
    """Static builders for the HTML pages of the application."""

    @staticmethod
    def template(title, body):
        """Wrap *body* in a full XHTML document titled *title*."""
        return xsc.Frag(
            html.DocTypeXHTML10transitional(), "\n",
            html.html(
                html.head(
                    html.meta(charset='utf-8'),
                    html.title(title),
                    html.style(css.string(
                        css.Rule(".textdiff", ".diff_add",
                                 backgroundColor="#CEF6CE"),
                        css.Rule(".textdiff", ".diff_next",
                                 backgroundColor="#c0c0c0"),
                        css.Rule(".textdiff", ".diff_sub",
                                 backgroundColor="#FFDADA"),
                        css.Rule(".textdiff", ".diff_chg",
                                 backgroundColor="#E5E1FF")
                    ))
                ), "\n",
                html.body(body)))

    @staticmethod
    def index(feeds):
        """Build the index page: a list of links to all *feeds*."""
        return Template.template(
            "Index",
            html.ul(
                *map(lambda feed: html.li(
                    html.a(feed.title or feed.url, href=url_for('feed', id=feed.id))),
                    feeds)))

    @staticmethod
    def feed(feed, items):
        """Build the page of one *feed*, with a widget per item."""
        # Fall back to the URL for untitled feeds, as index() does.
        feed_name = feed.title or feed.url
        return Template.template(
            "Feed {}".format(feed_name),
            html.div(
                html.a("Back", href=url_for('index')),
                html.h1(feed_name),
                *map(ItemWidget, items)))

    @staticmethod
    def item(item, versions):
        """Build the page of one *item*.

        *versions* is a list; each version is shown as a diff against the
        one before it, and the first version is shown on its own.
        """
        previous_versions = [None] + versions[:-1]
        pairs = list(zip(versions, previous_versions))
        # Fall back to the URL for untitled feeds, as index() does.
        feed_name = item.feed.title or item.feed.url
        return Template.template("Item: {}".format(item.title),
                                 html.div(
                                     html.a("Back to {}".format(feed_name),
                                            href=url_for('feed', id=item.feed.id)),
                                     html.h1(item.title),
                                     *map(lambda pair: html.div(
                                         Difftable(pair[0], pair[1]),
                                         html.hr()),
                                         pairs)))


def get_app():
    """Create the Flask application with the index, feed and item routes."""
    app = Flask(__name__)

    # The view function names double as the endpoint names used by
    # url_for() in the templates above; do not rename them.
    @app.route('/')
    def index():
        return Template.index(data.Feed.select()).string("utf-8")

    @app.route('/feed/<id>')
    def feed(id):
        feed = data.Feed.get(data.Feed.id == id)
        items = data.Item.select() \
            .where(data.Item.feed == feed) \
            .order_by(data.Item.created_date.desc())
        return Template.feed(feed, items).string("utf-8")

    @app.route('/item/<id>')
    def item(id):
        item = data.Item.get(data.Item.id == id)
        versions = data.Version.select() \
            .where(data.Version.item == item) \
            .order_by(data.Version.created_date)
        return Template.item(item, list(versions)).string("utf-8")

    return app


# ---------------------------------------------------------------------------
# watchnews/wsgi.py   (new file mode 100644, index 0000000..bd157cb)
# ---------------------------------------------------------------------------
from watchnews import web, data
import os


# The database location comes from the environment; refuse to start without it.
dbpath = os.environ.get("WATCHNEWS_DBPATH")
if not dbpath:
    raise Exception("WATCHNEWS_DBPATH must be set")

data.init_sqlite(dbpath)
application = web.get_app()