path: root/watchnews.py
author     Yves Fischer <yvesf-git@xapek.org>  2015-11-22 00:25:56 +0100
committer  Yves Fischer <yvesf-git@xapek.org>  2015-11-22 00:25:56 +0100
commit     6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0 (patch)
tree       07b69c3c6a665ef290591e184ca0d2ba3b73fe01 /watchnews.py
parent     49ac3c20cb77b90493ce79b4e31cf0f58cba0116 (diff)
download   watchnews-6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0.tar.gz
           watchnews-6080b38fb2b6b3c1017bdd34bb7552bc7e26a4a0.zip
project restructure
make it a proper python package
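Making a script like this "a proper python package" typically means moving the top-level modules (this watchnews.py plus the data and web modules it imports below) into a watchnews/ package directory with a packaging stub. A minimal setup.py sketch of that shape follows; the module layout, entry point, and dependency names are illustrative assumptions, not contents of this commit.

# setup.py -- hypothetical packaging stub for the restructured project
# (layout, entry point and dependency names are assumptions, not from this commit)
from setuptools import setup, find_packages

setup(
    name='watchnews',
    packages=find_packages(),  # would pick up a watchnews/ package directory
    # dependencies inferred from the imports in the deleted script below
    install_requires=['feedparser', 'newspaper3k', 'peewee'],
    entry_points={
        # assumed console entry point replacing the executable top-level watchnews.py
        'console_scripts': ['watchnews = watchnews.main:main'],
    },
)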
Diffstat (limited to 'watchnews.py')
-rwxr-xr-x  watchnews.py  137
1 file changed, 0 insertions, 137 deletions
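The data module the script imports is not part of this diff. For orientation, here is a plausible peewee sketch of the Feed/Item/Version models it must provide; every field name is inferred from attribute access in the code below, so this is a hedged reconstruction, not the project's actual data.py.

# data.py (hypothetical) -- models inferred from how watchnews.py uses them
import datetime
from peewee import (Proxy, Model, SqliteDatabase, CharField, TextField,
                    ForeignKeyField, DateTimeField)

# the script calls data.db.initialize(db), which matches a peewee Proxy;
# it also reaches SqliteDatabase through this module (data.SqliteDatabase)
db = Proxy()

class BaseModel(Model):
    class Meta:
        database = db

class Feed(BaseModel):
    url = CharField()
    title = CharField(null=True)
    created_date = DateTimeField(default=datetime.datetime.now)

class Item(BaseModel):
    feed = ForeignKeyField(Feed)
    uid = CharField()
    title = CharField(null=True)

class Version(BaseModel):
    item = ForeignKeyField(Item)
    uid = CharField()
    url = CharField()
    authors = CharField()
    title = CharField()
    text = TextField()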
diff --git a/watchnews.py b/watchnews.py
deleted file mode 100755
index e22b483..0000000
--- a/watchnews.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python3
-import data
-import web
-
-import feedparser
-from newspaper import Article
-from newspaper.outputformatters import OutputFormatter
-from newspaper.cleaners import DocumentCleaner
-
-import os
-import argparse
-import logging
-
-
-logging.basicConfig(level=logging.INFO)
-
-def update(feed):
- logging.info("Update %s", feed.url)
- result = feedparser.parse(feed.url)
- if result['feed']['title'] != feed.title:
- feed.title = result['feed']['title']
- feed.save()
-
- for entry in result['entries']:
- try:
- url = entry['link']
- if 'id' in entry:
- uid = entry['id']
- else:
- uid = entry['link']
-
- item = data.Item.select() \
- .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
- .first()
- if not item:
- item = data.Item.create(feed=feed, uid=uid)
-
-
- paper = Article(entry['link'])
- paper.download()
- paper.parse()
-
- if item.title != paper.title:
- item.title = paper.title
- item.save()
-
- version = data.Version()
- version.item = item
- version.uid = uid
- version.url = entry['link']
- authors = paper.authors
- authors.sort()
- version.authors = ", ".join(authors)
- version.title = paper.title
- version.text = paper.text
-
-            # alternative: check whether the RSS summary yields a longer text
- document_cleaner = DocumentCleaner(paper.config)
- output_formatter = OutputFormatter(paper.config)
- # override this method since it doesn't work on summaries
- output_formatter.links_to_text = lambda: None
- doc = paper.config.get_parser().fromstring(entry['summary'])
- doc = document_cleaner.clean(doc)
- text, article_html = output_formatter.get_formatted(doc)
-
- if len(text) < 2 and '<' not in entry['summary']:
- text = entry['summary']
-
- if len(text) > len(version.text):
- version.text = text
-
-            if len(version.text) < 2:  # fewer than 2 chars almost certainly means extraction failed
-                raise Exception("failed to parse {}\n{}".format(entry, version))
-
-            # check whether an identical version was already stored
- ident_version = data.Version.select().where(
- (data.Version.item == version.item) &
- (data.Version.title == version.title) &
- (data.Version.authors == version.authors) &
- (data.Version.text == version.text)).first()
-
- if ident_version:
- logging.info("No change, skip %s", item.uid)
- else:
- version.save()
- logging.info("Saved new version of %s: %s", item.uid, version.id)
-        except Exception:
- logging.exception("Failed to process %s", entry['link'])
-
-
-if __name__ == "__main__":
- db = data.SqliteDatabase(os.path.join(
- os.path.dirname(__file__), "db.sqlite"))
- data.db.initialize(db)
-
- data.db.connect()
- data.db.create_tables(filter(lambda t: not t.table_exists(),
- data.BaseModel.__subclasses__()))
-
-    parser = argparse.ArgumentParser(description='Watch news feeds and record changed article versions.')
- parser.add_argument('--add', metavar='URL',
- help='add feed with URL')
- parser.add_argument('--update', action='store_true',
- help='Update all known feeds')
- parser.add_argument('--update-feed', type=int, metavar='ID',
- help='Update feed with ID')
- parser.add_argument('--remove', type=int, metavar='ID',
- help='Remove feed with ID')
- parser.add_argument('--list', action='store_true',
- help='List all registered feeds')
- parser.add_argument('--web', action='store_true',
- help='Run web interface')
-
- args = parser.parse_args()
- if args.update:
- for feed in data.Feed.select():
- try:
- update(feed)
-            except Exception:
- logging.exception("Failed to update %s", feed.url)
- if args.update_feed:
- feed = data.Feed.get(data.Feed.id == args.update_feed)
- if feed:
- update(feed)
- if args.add:
- feed = data.Feed.create(url=args.add)
- print("Added {}".format(feed))
- if args.remove:
- feed = data.Feed.get(data.Feed.id == args.remove)
- feed.delete_instance()
- print("Removed {}".format(feed))
- if args.list:
- for feed in data.Feed.select().order_by(data.Feed.created_date):
- print("{0.id} - {1} - {0.title} - {0.url}".format(
- feed, feed.created_date.strftime('%x %X')))
- if args.web:
- web.run()
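The trickiest part of the deleted update() is the fallback that re-runs newspaper's cleaning pipeline on the raw RSS summary and keeps whichever extraction produced more text. That technique can be isolated into a standalone helper; this is a sketch using the same newspaper API calls as the deleted file, with a placeholder feed URL.

#!/usr/bin/env python3
# Standalone sketch of the summary-fallback extraction used above.
import feedparser
from newspaper import Article
from newspaper.cleaners import DocumentCleaner
from newspaper.outputformatters import OutputFormatter


def text_from_summary(config, summary_html):
    # Run newspaper's cleaner/formatter over the raw RSS summary markup.
    cleaner = DocumentCleaner(config)
    formatter = OutputFormatter(config)
    # links_to_text assumes a fully parsed article; disable it for summaries
    # (same override as in the deleted file).
    formatter.links_to_text = lambda: None
    doc = config.get_parser().fromstring(summary_html)
    doc = cleaner.clean(doc)
    text, _html = formatter.get_formatted(doc)
    return text


if __name__ == "__main__":
    result = feedparser.parse('https://example.org/feed.xml')  # placeholder URL
    for entry in result['entries']:
        paper = Article(entry['link'])
        paper.download()
        paper.parse()
        # entries without a summary fall back to the fetched article text only
        summary_text = text_from_summary(paper.config, entry.get('summary', ''))
        # keep whichever extraction produced more text, as update() does above
        best = summary_text if len(summary_text) > len(paper.text) else paper.text
        print(entry['link'], len(best))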