blob: c476be76e1d35e8cfac9e3cc35a607a981204dbf (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
#!/usr/bin/env python3
from . import data
import feedparser
from newspaper import Article
from newspaper.outputformatters import OutputFormatter
from newspaper.cleaners import DocumentCleaner
import logging
def dump_paper(link):
    """Download and parse the article at *link* and print a debug dump.

    Args:
        link: URL of the article to fetch.
    """
    paper = Article(link)
    paper.download()
    paper.parse()
    # Access the attributes explicitly instead of splatting
    # **paper.__dict__, so the dump does not silently break (or leak
    # unrelated internals) if newspaper changes its private attributes.
    print("""\
Article Dump:
Title: {title}
Url: {url}
Authors: {authors}
Text: {text}
-----
""".format(title=paper.title, url=paper.url,
           authors=paper.authors, text=paper.text))
def update(feed):
    """Fetch *feed*'s RSS/Atom document and store new article versions.

    For every entry the linked article is downloaded and parsed; a new
    ``data.Version`` row is saved unless an identical version (same
    title, authors and text) already exists for the item.  Each entry is
    processed best-effort: a failure is logged and the loop continues.

    Args:
        feed: a ``data.Feed`` model instance with ``url`` and ``title``
            attributes and a peewee-style ``save()`` method.
    """
    logging.info('Update %s', feed.url)
    result = feedparser.parse(feed.url)
    # Keep the stored feed title in sync with the published one.
    if result['feed']['title'] != feed.title:
        feed.title = result['feed']['title']
        feed.save()
    for entry in result['entries']:
        try:
            # Prefer the entry's stable id; fall back to the link.
            uid = entry['id'] if 'id' in entry else entry['link']
            item = data.Item.select() \
                .where((data.Item.uid == uid) & (data.Item.feed == feed)) \
                .first()
            if not item:
                item = data.Item.create(feed=feed, uid=uid)
            paper = Article(entry['link'])
            paper.download()
            paper.parse()
            if item.title != paper.title:
                item.title = paper.title
                item.save()
            version = data.Version()
            version.item = item
            version.uid = uid
            version.url = entry['link']
            # sorted() keeps paper.authors unmutated, unlike list.sort().
            version.authors = ', '.join(sorted(paper.authors))
            version.title = paper.title
            version.text = paper.text
            # Alternative: the RSS body may carry a bigger text than the
            # scraped page.  This is strictly best-effort — a missing
            # 'summary' key or a parser error here must not discard the
            # version already built from the article itself.
            try:
                summary = entry.get('summary')
                if summary:
                    document_cleaner = DocumentCleaner(paper.config)
                    output_formatter = OutputFormatter(paper.config)
                    # override this method since it doesn't work on summaries
                    output_formatter.links_to_text = lambda: None
                    doc = paper.config.get_parser().fromstring(summary)
                    doc = document_cleaner.clean(doc)
                    text, article_html = output_formatter.get_formatted(doc)
                    if len(text) < 2 and '<' not in summary:
                        # Plain-text summary the formatter could not handle;
                        # use it verbatim.
                        text = summary
                    if len(text) > len(version.text):
                        version.text = text
            except Exception:
                logging.exception('Summary fallback failed for %s',
                                  entry['link'])
            if len(version.text) < 2:  # less than 2 chars is likely failure
                raise Exception('failed to parse {}\n{}'.format(entry, version))
            # Skip saving when the latest stored version is identical.
            ident_version = data.Version.select().where(
                (data.Version.item == version.item) &
                (data.Version.title == version.title) &
                (data.Version.authors == version.authors) &
                (data.Version.text == version.text)).first()
            if ident_version:
                logging.info('No change, skip %s', item.uid)
            else:
                version.save()
                logging.info('Saved new version of %s: %s', item.uid, version.id)
        except Exception:
            # Per-entry boundary: log with traceback and move on.
            logging.exception('Failed to process %s', entry['link'])
|