From 65a19f4c5fdeea6b4afa12d138f9319a063d618f Mon Sep 17 00:00:00 2001
From: Yves Fischer
Date: Fri, 9 Dec 2016 21:19:19 +0100
Subject: add --test to test newspaper3k text extraction

---
 watchnews-cli      |  5 ++++-
 watchnews/fetch.py | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/watchnews-cli b/watchnews-cli
index f3c080a..b43d1c2 100755
--- a/watchnews-cli
+++ b/watchnews-cli
@@ -17,6 +17,8 @@ if __name__ == '__main__':
                         help='Update all known feeds')
     parser.add_argument('--update-feed', type=int, metavar='ID',
                         help='Update feed with ID')
+    parser.add_argument('--test', type=str, metavar='URL',
+                        help='Fetch URL with newspaper library and dump result for testing')
     parser.add_argument('--remove', type=int, metavar='ID',
                         help='Remove feed with ID')
     parser.add_argument('--list', action='store_true',
@@ -60,5 +62,6 @@ if __name__ == '__main__':
         web.get_app().run(debug=args.web_debug)
     if args.rss:
         print(rss.rss())
-
+    if args.test:
+        fetch.dump_paper(args.test)
 
diff --git a/watchnews/fetch.py b/watchnews/fetch.py
index e6c5c39..c476be7 100755
--- a/watchnews/fetch.py
+++ b/watchnews/fetch.py
@@ -8,6 +8,20 @@ from newspaper.cleaners import DocumentCleaner
 import logging
 
 
+def dump_paper(link):
+    paper = Article(link)
+    paper.download()
+    paper.parse()
+    print("""\
+Article Dump:
+Title: {title}
+Url: {url}
+Authors: {authors}
+Text: {text}
+-----
+
+""".format(**paper.__dict__))
+
 def update(feed):
     logging.info('Update %s', feed.url)
     result = feedparser.parse(feed.url)
--
cgit v1.2.1
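
Note (not part of the patch): a minimal sketch of how the new code path might be exercised once the change is applied. The URL is purely illustrative, and newspaper3k must be installed for Article() to work.

    # Hypothetical usage: call the new helper directly from Python
    from watchnews import fetch

    fetch.dump_paper('https://example.com/some-article')

    # Roughly equivalent CLI invocation via the new flag:
    #   ./watchnews-cli --test https://example.com/some-article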