1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
#!/usr/bin/env python3
import data
import web
import feedparser
from newspaper import Article
from newspaper.outputformatters import OutputFormatter
from newspaper.cleaners import DocumentCleaner
import os
import argparse
import logging
logging.basicConfig(level=logging.INFO)
def update(feed):
    """Fetch *feed*, download each entry's article and store a new
    data.Version row whenever the content differs from the last stored
    version of the same item.

    feed: a data.Feed instance providing .url and .title; .title is
    refreshed from the live feed when it changed.

    Failures are handled per entry: one broken article is logged and
    skipped instead of aborting the whole feed update.
    """
    logging.info("Update %s", feed.url)
    result = feedparser.parse(feed.url)
    # Keep the stored feed title in sync with the live feed; guard
    # against feeds that expose no title at all.
    live_title = result['feed'].get('title')
    if live_title and live_title != feed.title:
        feed.title = live_title
        feed.save()
    for entry in result['entries']:
        try:
            url = entry.get('link')
            if not url:
                logging.warning("Entry without link, skipped: %s", entry)
                continue
            # Prefer the feed-supplied id; fall back to the link.
            uid = entry.get('id', url)
            item = data.Item.select() \
                .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
                .first()
            if not item:
                item = data.Item.create(feed=feed, uid=uid)
            paper = Article(url)
            paper.download()
            paper.parse()
            if item.title != paper.title:
                item.title = paper.title
                item.save()
            version = data.Version()
            version.item = item
            version.uid = uid
            version.url = url
            # Sort authors so the same set of names always serializes
            # identically; otherwise the duplicate check below misfires.
            version.authors = ", ".join(sorted(paper.authors))
            version.title = paper.title
            version.text = paper.text
            # alternative, try if the rss body got a bigger text
            summary = entry.get('summary', '')
            if summary:
                document_cleaner = DocumentCleaner(paper.config)
                output_formatter = OutputFormatter(paper.config)
                # override this method since it doesn't work on summaries
                output_formatter.links_to_text = lambda: None
                doc = paper.config.get_parser().fromstring(summary)
                doc = document_cleaner.clean(doc)
                text, article_html = output_formatter.get_formatted(doc)
                if len(text) < 2 and '<' not in summary:
                    # Plain-text summary that the formatter discarded.
                    text = summary
                if len(text) > len(version.text):
                    version.text = text
            if len(version.text) < 2:  # less than 2 chars is likely failure
                raise ValueError("failed to parse {}\n{}".format(entry, version))
            # search if the previous version was the same
            ident_version = data.Version.select().where(
                (data.Version.item == version.item) &
                (data.Version.title == version.title) &
                (data.Version.authors == version.authors) &
                (data.Version.text == version.text)).first()
            if ident_version:
                logging.info("No change, skip %s", item.uid)
            else:
                version.save()
                logging.info("Saved new version of %s: %s", item.uid, version.id)
        except Exception:
            # Best-effort: log with whatever link info is available and
            # continue with the next entry.
            logging.exception("Failed to process %s", entry.get('link'))
if __name__ == "__main__":
db = data.SqliteDatabase(os.path.join(
os.path.dirname(__file__), "db.sqlite"))
data.db.initialize(db)
data.db.connect()
data.db.create_tables(filter(lambda t: not t.table_exists(),
data.BaseModel.__subclasses__()))
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('--add', metavar='URL',
help='add feed with URL')
parser.add_argument('--update', action='store_true',
help='Update all known feeds')
parser.add_argument('--update-feed', type=int, metavar='ID',
help='Update feed with ID')
parser.add_argument('--remove', type=int, metavar='ID',
help='Remove feed with ID')
parser.add_argument('--list', action='store_true',
help='List all registered feeds')
parser.add_argument('--web', action='store_true',
help='Run web interface')
args = parser.parse_args()
if args.update:
for feed in data.Feed.select():
try:
update(feed)
except Exception as e:
logging.exception("Failed to update %s", feed.url)
if args.update_feed:
feed = data.Feed.get(data.Feed.id == args.update_feed)
if feed:
update(feed)
if args.add:
feed = data.Feed.create(url=args.add)
print("Added {}".format(feed))
if args.remove:
feed = data.Feed.get(data.Feed.id == args.remove)
feed.delete_instance()
print("Removed {}".format(feed))
if args.list:
for feed in data.Feed.select().order_by(data.Feed.created_date):
print("{0.id} - {1} - {0.title} - {0.url}".format(
feed, feed.created_date.strftime('%x %X')))
if args.web:
web.run()
|