#!/usr/bin/env python3
# Provenance: watchnews/fetch.py as added by commit
# 6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0 (Yves Fischer, 2015-11-22,
# "project restructure" / "make it a proper python package").
"""Fetch a feed and store a new version of every article that changed."""
import logging

import feedparser
from newspaper import Article
from newspaper.cleaners import DocumentCleaner
from newspaper.outputformatters import OutputFormatter

from . import data


def _summary_text(paper, summary):
    """Return the plain text extracted from an entry's RSS summary.

    The summary is run through the same cleaner/formatter pipeline that
    ``paper`` is configured with.  An empty or missing summary yields ``''``
    so the caller simply keeps the downloaded article text.

    :param paper: the ``Article`` whose ``config`` drives the parsing.
    :param summary: the entry's summary markup/text, possibly empty.
    :returns: the extracted text (may be empty).
    """
    if not summary:
        return ''

    document_cleaner = DocumentCleaner(paper.config)
    output_formatter = OutputFormatter(paper.config)
    # override this method since it doesn't work on summaries
    output_formatter.links_to_text = lambda: None
    doc = paper.config.get_parser().fromstring(summary)
    doc = document_cleaner.clean(doc)
    text, _article_html = output_formatter.get_formatted(doc)

    # Nothing was extracted and the summary holds no markup: use it verbatim.
    if len(text) < 2 and '<' not in summary:
        text = summary
    return text


def _process_entry(feed, entry):
    """Download one feed entry and save a ``Version`` if its content changed.

    :param feed: the feed record the entry belongs to.
    :param entry: one element of the parsed feed's ``entries``.
    :raises KeyError: if the entry has no ``link``.
    :raises RuntimeError: if neither the article nor the summary produced
        at least two characters of text.
    """
    link = entry['link']
    # Prefer the feed-supplied id; fall back to the link as identifier.
    uid = entry['id'] if 'id' in entry else link

    item = data.Item.select() \
        .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
        .first()
    if not item:
        item = data.Item.create(feed=feed, uid=uid)

    paper = Article(link)
    paper.download()
    paper.parse()

    if item.title != paper.title:
        item.title = paper.title
        item.save()

    version = data.Version()
    version.item = item
    version.uid = uid
    version.url = link
    # sorted() instead of list.sort(): do not reorder paper.authors in place.
    version.authors = ", ".join(sorted(paper.authors))
    version.title = paper.title
    version.text = paper.text

    # alternative, try if the rss body got a bigger text
    text = _summary_text(paper, entry.get('summary'))
    if len(text) > len(version.text):
        version.text = text

    if len(version.text) < 2:  # less than 2 chars is likely failure
        raise RuntimeError(
            "failed to parse {}\n{}".format(entry, version))

    # search if the previous version was the same
    ident_version = data.Version.select().where(
        (data.Version.item == version.item) &
        (data.Version.title == version.title) &
        (data.Version.authors == version.authors) &
        (data.Version.text == version.text)).first()

    if ident_version:
        logging.info("No change, skip %s", item.uid)
    else:
        version.save()
        logging.info("Saved new version of %s: %s", item.uid, version.id)


def update(feed):
    """Fetch ``feed.url`` and record new versions of its entries.

    The stored feed title is refreshed when the parsed feed reports a
    different one.  Each entry is then processed independently: a failure
    in one entry is logged with its traceback and does not stop the
    remaining entries from being processed.

    :param feed: feed record; ``url`` and ``title`` are read, and ``title``
        is written and saved when it changed.
    """
    logging.info("Update %s", feed.url)
    result = feedparser.parse(feed.url)

    # A parsed feed may carry no title at all; keep the stored title then
    # instead of failing before any entry is looked at.
    title = result['feed'].get('title')
    if title is not None and title != feed.title:
        feed.title = title
        feed.save()

    for entry in result['entries']:
        try:
            _process_entry(feed, entry)
        except Exception:
            # Best-effort per entry.  Use .get() here: the entry may be the
            # one lacking a link, and the handler itself must never raise.
            logging.exception("Failed to process %s",
                              entry.get('link', entry.get('id')))