#!/usr/bin/env python3
"""Watch news feeds and store every distinct version of each article.

Command-line front end: feeds can be added, removed, listed and updated,
and the web interface can be started.  Updating a feed downloads every
linked article and saves a new ``data.Version`` row whenever the title,
authors or text differ from what is already stored.
"""
# Reconstructed from the pre-deletion watchnews.py (blob e22b483), which was
# removed by commit 6080b38 ("project restructure: make it a proper python
# package").

import argparse
import logging
import os

import feedparser
from newspaper import Article
from newspaper.cleaners import DocumentCleaner
from newspaper.outputformatters import OutputFormatter

import data
import web


logging.basicConfig(level=logging.INFO)


def _summary_text(paper, summary):
    """Return the plain text extracted from a feed entry's summary.

    :param paper: the parsed ``Article``; only its ``config`` is used here.
    :param summary: the entry's summary markup, or ``None``/empty if the
        feed did not provide one.
    :returns: the extracted text, or ``""`` when there is no summary.
    """
    if not summary:
        return ""

    document_cleaner = DocumentCleaner(paper.config)
    output_formatter = OutputFormatter(paper.config)
    # override this method since it doesn't work on summaries
    output_formatter.links_to_text = lambda: None
    doc = paper.config.get_parser().fromstring(summary)
    doc = document_cleaner.clean(doc)
    text, _article_html = output_formatter.get_formatted(doc)

    # A summary without any markup that yielded (almost) nothing is used
    # verbatim instead.
    if len(text) < 2 and '<' not in summary:
        text = summary
    return text


def _update_entry(feed, entry):
    """Download one feed entry and store it if it differs from known versions.

    :param feed: the ``data.Feed`` the entry belongs to.
    :param entry: one element of ``feedparser``'s ``entries`` list.
    :raises KeyError: if the entry has no ``link``.
    :raises ValueError: if neither the article nor the summary yielded text.
    """
    link = entry['link']
    uid = entry['id'] if 'id' in entry else link

    item = (data.Item.select()
            .where((data.Item.uid == uid) & (data.Item.feed == feed))
            .first())
    if not item:
        item = data.Item.create(feed=feed, uid=uid)

    paper = Article(link)
    paper.download()
    paper.parse()

    if item.title != paper.title:
        item.title = paper.title
        item.save()

    version = data.Version()
    version.item = item
    version.uid = uid
    version.url = link
    # sorted() keeps paper.authors untouched while still giving a stable
    # string for the duplicate check below.
    version.authors = ", ".join(sorted(paper.authors))
    version.title = paper.title
    version.text = paper.text

    # alternative, try if the rss body got a bigger text
    summary_text = _summary_text(paper, entry.get('summary'))
    if len(summary_text) > len(version.text):
        version.text = summary_text

    if len(version.text) < 2:  # less than 2 chars is likely failure
        raise ValueError("failed to parse {}\n{}".format(entry, version))

    # search if the previous version was the same
    ident_version = data.Version.select().where(
        (data.Version.item == version.item) &
        (data.Version.title == version.title) &
        (data.Version.authors == version.authors) &
        (data.Version.text == version.text)).first()

    if ident_version:
        logging.info("No change, skip %s", item.uid)
    else:
        version.save()
        logging.info("Saved new version of %s: %s", item.uid, version.id)


def update(feed):
    """Fetch *feed* and record new versions of all of its entries.

    A failure while processing one entry is logged and does not prevent the
    remaining entries from being processed.

    :param feed: the ``data.Feed`` to refresh; its ``title`` is updated when
        the feed reports a different one.
    """
    logging.info("Update %s", feed.url)
    result = feedparser.parse(feed.url)

    # Not every feed carries a title; only overwrite when one is present.
    title = result['feed'].get('title')
    if title is not None and title != feed.title:
        feed.title = title
        feed.save()

    for entry in result['entries']:
        try:
            _update_entry(feed, entry)
        except Exception:
            # .get(): the missing link may be the very reason we are here,
            # and raising inside the handler would abort the whole feed.
            logging.exception("Failed to process %s", entry.get('link'))


def _get_feed(feed_id):
    """Return the feed with *feed_id*, or ``None`` (after logging) if unknown.

    NOTE(review): assumes ``data.Feed`` is a peewee model, whose ``get``
    raises ``DoesNotExist`` for a missing row - confirm against ``data``.
    """
    try:
        return data.Feed.get(data.Feed.id == feed_id)
    except data.Feed.DoesNotExist:
        logging.error("No feed with ID %s", feed_id)
        return None


def main():
    """Open the database, parse the command line and run the chosen actions."""
    db = data.SqliteDatabase(os.path.join(
        os.path.dirname(__file__), "db.sqlite"))
    data.db.initialize(db)

    data.db.connect()
    data.db.create_tables([table for table in data.BaseModel.__subclasses__()
                           if not table.table_exists()])

    parser = argparse.ArgumentParser(
        description='Track changes in the articles of news feeds.')
    parser.add_argument('--add', metavar='URL',
                        help='add feed with URL')
    parser.add_argument('--update', action='store_true',
                        help='Update all known feeds')
    parser.add_argument('--update-feed', type=int, metavar='ID',
                        help='Update feed with ID')
    parser.add_argument('--remove', type=int, metavar='ID',
                        help='Remove feed with ID')
    parser.add_argument('--list', action='store_true',
                        help='List all registered feeds')
    parser.add_argument('--web', action='store_true',
                        help='Run web interface')

    args = parser.parse_args()
    if args.update:
        for feed in data.Feed.select():
            try:
                update(feed)
            except Exception:
                logging.exception("Failed to update %s", feed.url)
    # "is not None" so that an explicit ID of 0 is not mistaken for "absent".
    if args.update_feed is not None:
        feed = _get_feed(args.update_feed)
        if feed is not None:
            update(feed)
    if args.add:
        feed = data.Feed.create(url=args.add)
        print("Added {}".format(feed))
    if args.remove is not None:
        feed = _get_feed(args.remove)
        if feed is not None:
            feed.delete_instance()
            print("Removed {}".format(feed))
    if args.list:
        for feed in data.Feed.select().order_by(data.Feed.created_date):
            print("{0.id} - {1} - {0.title} - {0.url}".format(
                feed, feed.created_date.strftime('%x %X')))
    if args.web:
        web.run()


if __name__ == "__main__":
    main()