author     Yves Fischer <yvesf-git@xapek.org>  2015-11-22 00:25:56 +0100
committer  Yves Fischer <yvesf-git@xapek.org>  2015-11-22 00:25:56 +0100
commit     6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0 (patch)
tree       07b69c3c6a665ef290591e184ca0d2ba3b73fe01 /watchnews/fetch.py
parent     49ac3c20cb77b90493ce79b4e31cf0f58cba0116 (diff)
download   watchnews-6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0.tar.gz
           watchnews-6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0.zip
project restructure
make it a proper python package
Diffstat (limited to 'watchnews/fetch.py')
-rwxr-xr-x  watchnews/fetch.py  83
1 file changed, 83 insertions, 0 deletions
diff --git a/watchnews/fetch.py b/watchnews/fetch.py
new file mode 100755
index 0000000..5201c1e
--- /dev/null
+++ b/watchnews/fetch.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+from . import data
+
+import feedparser
+from newspaper import Article
+from newspaper.outputformatters import OutputFormatter
+from newspaper.cleaners import DocumentCleaner
+
+import logging
+
+def update(feed):
+    logging.info("Update %s", feed.url)
+    result = feedparser.parse(feed.url)
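+    # Refresh the stored feed title if it changed upstream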
+    if result['feed']['title'] != feed.title:
+        feed.title = result['feed']['title']
+        feed.save()
+
+    for entry in result['entries']:
+        try:
+            url = entry['link']
+            if 'id' in entry:
+                uid = entry['id']
+            else:
+                uid = entry['link']
+
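+            # Look up the Item for this entry by uid, creating it on first sight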
+            item = data.Item.select() \
+                .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
+                .first()
+            if not item:
+                item = data.Item.create(feed=feed, uid=uid)
+
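+            # Download and parse the full article with newspaper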
+            paper = Article(entry['link'])
+            paper.download()
+            paper.parse()
+
+            if item.title != paper.title:
+                item.title = paper.title
+                item.save()
+
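+            # Snapshot the parsed article as a new Version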
+            version = data.Version()
+            version.item = item
+            version.uid = uid
+            version.url = entry['link']
+            authors = paper.authors
+            authors.sort()
+            version.authors = ", ".join(authors)
+            version.title = paper.title
+            version.text = paper.text
+
+            # Alternatively, check whether the RSS entry body yields a longer text
+            document_cleaner = DocumentCleaner(paper.config)
+            output_formatter = OutputFormatter(paper.config)
+            # override this method since it doesn't work on summaries
+            output_formatter.links_to_text = lambda: None
+            doc = paper.config.get_parser().fromstring(entry['summary'])
+            doc = document_cleaner.clean(doc)
+            text, article_html = output_formatter.get_formatted(doc)
+
+            if len(text) < 2 and '<' not in entry['summary']:
+                text = entry['summary']
+
+            if len(text) > len(version.text):
+                version.text = text
+
+            if len(version.text) < 2:  # less than 2 chars is likely a parse failure
+                raise Exception("failed to parse {}\n{}".format(entry, version))
+
+            # Check whether the previous version was identical
+            ident_version = data.Version.select().where(
+                (data.Version.item == version.item) &
+                (data.Version.title == version.title) &
+                (data.Version.authors == version.authors) &
+                (data.Version.text == version.text)).first()
+
+            if ident_version:
+                logging.info("No change, skip %s", item.uid)
+            else:
+                version.save()
+                logging.info("Saved new version of %s: %s", item.uid, version.id)
+        except Exception:
+            logging.exception("Failed to process %s", entry['link'])
+
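For context, a minimal sketch of how update() might be driven from elsewhere in the package. The peewee models are only partly visible in this diff, so the Feed model and the Feed.select() query below are assumptions, not code from this commit:

    # Hypothetical driver loop; data.Feed and the logging setup are assumed.
    import logging

    from watchnews import data
    from watchnews.fetch import update

    logging.basicConfig(level=logging.INFO)

    # Run one fetch pass over every configured feed
    for feed in data.Feed.select():
        update(feed)

Since update() catches and logs exceptions per entry, a failure on one article does not abort the rest of the pass.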