summary | refs | log | tree | commit | diff
path: root/watchnews
diff options
context:
space:
mode:
author Yves Fischer <yvesf-git@xapek.org> 2015-11-22 00:25:56 +0100
committer Yves Fischer <yvesf-git@xapek.org> 2015-11-22 00:25:56 +0100
commit 6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0 (patch)
tree 07b69c3c6a665ef290591e184ca0d2ba3b73fe01 /watchnews
parent 49ac3c20cb77b90493ce79b4e31cf0f58cba0116 (diff)
download watchnews-6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0.tar.gz
watchnews-6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0.zip
project restructure
make it a proper python package
Diffstat (limited to 'watchnews')
-rw-r--r-- watchnews/__init__.py 1
-rw-r--r-- watchnews/css.py 29
-rw-r--r-- watchnews/data.py 50
-rwxr-xr-x watchnews/fetch.py 83
-rw-r--r-- watchnews/web.py 207
-rw-r--r-- watchnews/wsgi.py 11
6 files changed, 381 insertions, 0 deletions
diff --git a/watchnews/__init__.py b/watchnews/__init__.py
new file mode 100644
index 0000000..1bb8bf6
--- /dev/null
+++ b/watchnews/__init__.py
@@ -0,0 +1 @@
+# empty
diff --git a/watchnews/css.py b/watchnews/css.py
new file mode 100644
index 0000000..b32a274
--- /dev/null
+++ b/watchnews/css.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python3
+import re
+
+
+class Rule:  # One CSS rule: selector parts plus a dict of property declarations.
+
+ def __init__(self, *path, **properties):  # positional args are selector parts; keyword args are CSS properties
+ self.path = path
+ self.properties = properties
+
+ def _f(self, prop):  # Render one (key, value) pair as a "key: value;" declaration line.
+ key, value = prop
+ key = re.sub("([A-Z])", "-\\1", key).lower()  # camelCase to dashed lowercase, e.g. backgroundColor -> background-color
+ return " {}: {};".format(key, value)
+
+ def __format__(self):  # NOTE(review): accepts no format_spec, so format(rule) or "{}".format(rule) would raise TypeError; only direct .__format__() calls work
+ result = " ".join(self.path)  # selector parts are joined with single spaces
+ result += " {\n"
+ result += "\n".join(map(self._f, self.properties.items()))  # one declaration per line
+ result += "\n}\n"
+ return result
+
+
+def string(*rules):  # Render every rule through its __format__() and join the results with newlines.
+ return "\n".join(map(lambda r: r.__format__(), rules))
+
+if __name__ == "__main__":  # Small demo, executed only when the module is run as a script.
+ print("CSS Demo")
+ print(Rule(".foo", "#blah", backgroundColor="red").__format__())  # prints a ".foo #blah" rule with one background-color declaration
diff --git a/watchnews/data.py b/watchnews/data.py
new file mode 100644
index 0000000..e4ecc24
--- /dev/null
+++ b/watchnews/data.py
@@ -0,0 +1,50 @@
+import peewee as p
+
+import os
+import datetime
+import re
+
+db = p.Proxy()  # Placeholder database shared by all models; init_sqlite() binds it via db.initialize().
+
+class SqliteDatabase(p.SqliteDatabase):  # peewee SqliteDatabase subclass that issues PRAGMA foreign_keys = ON from its connection hook.
+ foreign_keys = True
+ def _add_conn_hooks(self, conn):  # overrides a private peewee hook - presumably called for each new connection; verify against the peewee version in use
+ p.SqliteDatabase._add_conn_hooks(self, conn)  # run the base-class hooks first
+ conn.execute('PRAGMA foreign_keys = ON;').fetchone()  # SQLite does not enforce foreign keys unless this is enabled on the connection
+
+def init_sqlite(path):  # Bind the db proxy to a SqliteDatabase at `path`, connect, and create the model tables that are missing.
+ db.initialize(SqliteDatabase(path))
+
+ db.connect()
+ db.create_tables(filter(lambda t: not t.table_exists(),  # keep only models whose table does not exist yet
+ BaseModel.__subclasses__()))  # NOTE(review): __subclasses__() returns direct subclasses only; a model deriving from Feed/Item/Version would be missed
+
+
+class BaseModel(p.Model):  # Common base class that attaches every model to the shared db proxy.
+ class Meta:
+ database = db
+ def __json__(self):  # returns the model's internal _data mapping
+ return self._data
+ def __repr__(self):  # "<ClassName key=value ...>" built from the entries of self._data
+ return "<{.__class__.__name__} {}>".format(self, " ".join(
+ map(lambda kv: "{0[0]}={0[1]}".format(kv), self._data.items())))
+
+class Feed(BaseModel):  # A feed, identified by its unique URL.
+ created_date = p.DateTimeField(default=datetime.datetime.now)  # the default is the callable itself, not a fixed timestamp
+ url = p.CharField(unique=True)
+ title = p.CharField(null=True)  # nullable; fetch.update() sets it from the parsed feed
+
+class Item(BaseModel):  # One feed entry; fetch.update() looks items up by (uid, feed).
+ created_date = p.DateTimeField(default=datetime.datetime.now)  # the default is the callable itself, not a fixed timestamp
+ feed = p.ForeignKeyField(Feed, related_name='items', on_delete='CASCADE')  # reverse accessor is Feed.items; declared ON DELETE CASCADE
+ uid = p.CharField()  # fetch.update() stores the entry id here, or the entry link when there is no id
+ title = p.CharField(null=True)  # nullable; fetch.update() sets it from the parsed article title
+
+class Version(BaseModel):  # One captured state of an item: url, title, authors and text.
+ created_date = p.DateTimeField(default=datetime.datetime.now)  # the default is the callable itself, not a fixed timestamp
+ item = p.ForeignKeyField(Item, related_name='versions', on_delete='CASCADE')  # reverse accessor is Item.versions; declared ON DELETE CASCADE
+ url = p.CharField()
+ title = p.CharField()
+ authors = p.CharField()  # fetch.update() stores the sorted author names joined with ", "
+ text = p.CharField()  # NOTE(review): holds the full article text in a CharField - a TextField may be the intended type; confirm
+
diff --git a/watchnews/fetch.py b/watchnews/fetch.py
new file mode 100755
index 0000000..5201c1e
--- /dev/null
+++ b/watchnews/fetch.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+from . import data
+
+import feedparser
+from newspaper import Article
+from newspaper.outputformatters import OutputFormatter
+from newspaper.cleaners import DocumentCleaner
+
+import logging
+
+def update(feed):  # Re-parse one feed and save a new Version for each entry whose title, authors or text changed.
+ logging.info("Update %s", feed.url)
+ result = feedparser.parse(feed.url)
+ if result['feed']['title'] != feed.title:  # keep the stored title in sync with the parsed feed
+ feed.title = result['feed']['title']
+ feed.save()
+
+ for entry in result['entries']:
+ try:  # nesting is flattened in this rendering; the matching except below logs the failing entry's link
+ url = entry['link']  # NOTE(review): url is never read; later lines use entry['link'] directly
+ if 'id' in entry:
+ uid = entry['id']
+ else:
+ uid = entry['link']  # entries without an id fall back to their link as uid
+
+ item = data.Item.select() \
+ .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
+ .first()
+ if not item:  # no Item with this uid exists in this feed yet
+ item = data.Item.create(feed=feed, uid=uid)
+
+
+ paper = Article(entry['link'])  # fetch and parse the linked article with newspaper
+ paper.download()
+ paper.parse()
+
+ if item.title != paper.title:  # the item carries the most recently parsed article title
+ item.title = paper.title
+ item.save()
+
+ version = data.Version()  # built in memory; saved below only when it differs from the stored versions
+ version.item = item
+ version.uid = uid  # NOTE(review): data.Version declares no uid field, so this attribute is presumably never persisted
+ version.url = entry['link']
+ authors = paper.authors
+ authors.sort()  # sorts paper.authors in place - presumably so author order does not affect the comparison below
+ version.authors = ", ".join(authors)
+ version.title = paper.title
+ version.text = paper.text
+
+ # alternative, try if the rss body got a bigger text
+ document_cleaner = DocumentCleaner(paper.config)
+ output_formatter = OutputFormatter(paper.config)
+ # override this method since it doesn't work on summaries
+ output_formatter.links_to_text = lambda: None
+ doc = paper.config.get_parser().fromstring(entry['summary'])  # NOTE(review): an entry without 'summary' presumably raises here and is skipped by the except below
+ doc = document_cleaner.clean(doc)
+ text, article_html = output_formatter.get_formatted(doc)  # article_html is not used afterwards
+
+ if len(text) < 2 and '<' not in entry['summary']:  # formatter produced nothing and the summary has no markup: use it verbatim
+ text = entry['summary']
+
+ if len(text) > len(version.text):  # keep whichever of article text and summary text is longer
+ version.text = text
+
+ if len(version.text) < 2: # less than 2 chars is likely failure
+ raise Exception("failed to parse {}\n{}".format(entry,version))
+
+ # search if the previous version was the same
+ ident_version = data.Version.select().where(  # NOTE(review): matches any stored version of the item, not only the latest, so a revert to older content is not saved
+ (data.Version.item == version.item) &
+ (data.Version.title == version.title) &
+ (data.Version.authors == version.authors) &
+ (data.Version.text == version.text)).first()
+
+ if ident_version:
+ logging.info("No change, skip %s", item.uid)
+ else:
+ version.save()
+ logging.info("Saved new version of %s: %s", item.uid, version.id)
+ except Exception as e:  # NOTE(review): e is unused; logging.exception() records the active traceback by itself
+ logging.exception("Failed to process %s", entry['link'])
+
diff --git a/watchnews/web.py b/watchnews/web.py
new file mode 100644
index 0000000..438e649
--- /dev/null
+++ b/watchnews/web.py
@@ -0,0 +1,207 @@
+from . import data, css
+
+from ll.xist import xsc, parse
+from ll.xist.ns import html
+from flask import Flask, url_for
+
+import re
+import difflib
+
+
+class DiffSupport:  # Mixin that turns difflib._mdiff output into XIST html elements. NOTE(review): _mdiff is a private difflib helper.
+
+ def _diff(self, line1, line2):  # Diff two single strings; returns (elements for line1, elements for line2).
+ diff = list(difflib._mdiff([line1], [line2]))
+ return (self._format_diff(diff[0][0][1]),  # diff[0] is the first ((num, text), (num, text), flag) tuple; [0][1] is the marked-up "from" text
+ self._format_diff(diff[0][1][1]))  # [1][1] is the marked-up "to" text
+
+ def _diff_lines(self, lines1, lines2):  # Build a table with one row per _mdiff tuple: line number and spans for each side.
+ diff = difflib._mdiff(lines1, lines2)
+ rows = []
+ for ((line1, diff1), (line2, diff2), flag) in diff:  # flag is unpacked but not used
+ rows.append(html.tr(
+ html.th(),
+ html.td(line1),
+ html.td(*self._format_diff(diff1)),
+ html.td(line2),
+ html.td(*self._format_diff(diff2))
+ ))
+ return html.table(rows, **{'class': 'textdiff'})
+
+ def _format_diff(self, line):  # Split a marked-up line into spans: "\x00" plus one of "+", "-", "^" opens a change, "\x01" closes it.
+ elems = []
+ nextpos = line.find("\x00")
+ while nextpos != -1 and nextpos + 1 < len(line):
+ actionclass = {  # the character after "\x00" selects the CSS class of the changed span
+ '+': 'diff_add', '-': 'diff_sub',
+ '^': 'diff_chg'}[line[nextpos + 1]]
+ endpos = line.find("\x01", nextpos + 2)
+
+ if nextpos != 0: # intermediate unchanged text
+ elems += [html.span(line[:nextpos])]
+
+ text = line[nextpos + 2:endpos]
+ elems += [html.span(text, **{'class': actionclass})]
+
+ line = line[endpos:]  # NOTE(review): the slice starts at the "\x01" terminator, so that character is carried into the following span; endpos + 1 may be intended - confirm
+ nextpos = line.find("\x00")
+
+ if line != "": # trailing unchanged text
+ elems += [html.span(line)]
+ return elems
+
+
+class Difftable(html.div, DiffSupport):  # A div showing one version as a table, or two versions compared side by side.
+
+ def __init__(self, to_version, from_version=None):  # without from_version the single-version table is rendered
+ super().__init__()
+ if from_version == None:  # NOTE(review): `is None` would be the idiomatic identity test
+ self.single_version(to_version)
+ else:
+ self.two_versions(to_version, from_version)
+
+ def single_version(self, version):  # Append a table with the version's title, date, link and text.
+ self.append(html.table(
+ html.tr(html.th("Title"), html.td(version.title)),
+ html.tr(html.th("Date"), html.td(
+ version.created_date.strftime("%x %X"))),
+ html.tr(html.th("Link"), html.td(
+ html.a(version.url, href=version.url))),
+ html.tr(html.th("Text", colspan=2)),
+ html.tr(html.td(map(html.p, version.text.split("\n")), colspan=2))  # one paragraph element per text line
+ ))
+
+ def two_versions(self, to_version, from_version):  # Append a comparison table: diffed titles, both dates, both links, and a line diff of the text.
+ def prepare_text(text):  # collapse runs of two or more newlines into exactly two, then split into lines
+ return re.sub("\n\n\n*", "\n\n", text).split("\n")
+ from_text = prepare_text(from_version.text)
+ to_text = prepare_text(to_version.text)
+
+ from_difftitle, to_difftitle = self._diff(
+ from_version.title, to_version.title)
+
+ diff = difflib._mdiff(from_text, to_text)  # NOTE(review): this result is never used; _diff_lines() below computes the diff itself
+ self.append(html.table(
+ html.tr(html.th("Title"),
+ html.td(from_difftitle),
+ html.td(to_difftitle), **{'class': 'textdiff'}),
+ html.tr(html.th("Date"),
+ html.td(from_version.created_date.strftime("%x %X")),
+ html.td(to_version.created_date.strftime("%x %X"))),
+ html.tr(html.th("Link"),
+ html.td(html.a(from_version.url, href=from_version.url)),
+ html.td(html.a(to_version.url, href=to_version.url))),
+ html.tr(html.th("Text", colspan=3)),
+ html.tr(html.td(self._diff_lines(from_text, to_text), colspan=3)),
+ **{'class': "versiondiff"}
+ ))
+
+
+class ItemWidget(html.div, DiffSupport):  # A div with a heading linking to the item page and a list of the item's versions.
+
+ def __init__(self, item):
+ super().__init__()
+ self.append(html.h2(html.a(item.title,
+ href=url_for('item', id=item.id))))
+ versionsFrom = [None] + list(item.versions)[:-1]  # predecessor of each version; None for the first one
+ versionsTo = item.versions  # NOTE(review): item.versions is evaluated a second time here and has no explicit ordering - presumably both evaluations return the same order; confirm
+ versions = list(zip(versionsFrom, versionsTo))  # (from_version, to_version) pairs
+
+ self.append(html.ul(
+ *map(self.version, versions)))
+
+ def version(self, versions):  # Render one list item: timestamp, " - ", and the title (diffed against the predecessor's title when there is one).
+ from_version, to_version = versions
+ if from_version == None:  # first version has nothing to compare against; NOTE(review): `is None` would be idiomatic
+ title = html.span(to_version.title)
+ else:
+ from_difftitle, to_difftitle = self._diff(  # only to_difftitle is used below
+ from_version.title, to_version.title)
+ title = html.span(to_difftitle, **{'class': 'textdiff'})
+ return html.li(html.span(to_version.created_date.strftime("%x %X")),
+ html.span(" - "),
+ title)
+
+
+class Template:  # Static page builders; each returns an XIST fragment for one page.
+
+ @staticmethod
+ def template(title, body):  # Wrap `body` in the shared page skeleton: doctype, head with the inline diff CSS, and body.
+ return xsc.Frag(
+ html.DocTypeXHTML10transitional(), "\n",
+ html.html(
+ html.head(
+ html.meta(charset='utf-8'),
+ html.title(title),
+ html.style(css.string(
+ css.Rule(".textdiff", ".diff_add",
+ backgroundColor="#CEF6CE"),
+ css.Rule(".textdiff", ".diff_next",  # NOTE(review): DiffSupport._format_diff emits only diff_add, diff_sub and diff_chg, so this rule looks unused
+ backgroundColor="#c0c0c0"),
+ css.Rule(".textdiff", ".diff_sub",
+ backgroundColor="#FFDADA"),
+ css.Rule(".textdiff", ".diff_chg",
+ backgroundColor="#E5E1FF")
+ ))
+ ), "\n",
+ html.body(body)))
+
+ @staticmethod
+ def index(feeds):  # Index page: a list with one link per feed.
+ return Template.template(
+ "Index",
+ html.ul(
+ *map(lambda feed: html.li(
+ html.a(feed.title or feed.url, href=url_for('feed', id=feed.id))),  # link text falls back to the URL when the feed has no title
+ feeds)))
+
+ @staticmethod
+ def feed(feed, items):  # Feed page: back link, heading, and one ItemWidget per item.
+ return Template.template(
+ "Feed {}".format(feed.title or feed.url),
+ html.div(
+ html.a("Back", href=url_for('index')),
+ html.h1(feed.title),  # NOTE(review): the page title above falls back to feed.url, this heading does not
+ *map(ItemWidget, items)))
+
+ @staticmethod
+ def item(item, versions):  # Item page: back link, heading, and one Difftable per version; `versions` must support slicing.
+ versionsA = versions
+ versionsB = [None] + versions[:-1]  # predecessor of each version; None for the first one
+ versions = list(zip(versionsA, versionsB))  # (version, predecessor) pairs
+ return Template.template("Item: {}".format(item.title),
+ html.div(
+ html.a("Back to {}".format(item.feed.title),
+ href=url_for('feed', id=item.feed.id)),
+ html.h1(item.title),
+ *map(lambda versionAB: html.div(
+ Difftable(versionAB[0], versionAB[1]),  # Difftable(to_version, from_version)
+ html.hr()),
+ versions)))
+
+
+def get_app():  # Build and return the Flask app with its index, feed and item routes.
+ app = Flask(__name__)
+
+ @app.route('/')
+ def index():  # list all feeds
+ return Template.index(data.Feed.select()).string("utf-8")
+
+ @app.route('/feed/<id>')
+ def feed(id):  # one feed with its items
+ feed = data.Feed.get(data.Feed.id == id)  # presumably raises for an unknown id; no handler is installed here - confirm
+ items = data.Item.select() \
+ .where(data.Item.feed == feed) \
+ .order_by(data.Item.created_date.desc())  # newest items first
+ return Template.feed(feed, items).string("utf-8")
+
+ @app.route('/item/<id>')
+ def item(id):  # one item with the diffs between its versions
+ item = data.Item.get(data.Item.id == id)
+ versions = data.Version.select() \
+ .where(data.Version.item == item) \
+ .order_by(data.Version.created_date)  # oldest first; Template.item pairs each version with the one before it
+ return Template.item(item, list(versions)).string("utf-8")  # list() because Template.item slices the versions
+
+ return app
+
diff --git a/watchnews/wsgi.py b/watchnews/wsgi.py
new file mode 100644
index 0000000..bd157cb
--- /dev/null
+++ b/watchnews/wsgi.py
@@ -0,0 +1,11 @@
+from watchnews import web, data
+import os
+
+
+dbpath = os.environ.get("WATCHNEWS_DBPATH")  # the SQLite file path is taken from the environment
+if not dbpath:  # an unset or empty value aborts the import
+ raise Exception("WATCHNEWS_DBPATH must be set")
+
+data.init_sqlite(dbpath)  # runs at import time: connects and creates missing tables
+application = web.get_app()  # module-level app object, presumably the name the WSGI server looks up
+