1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
import codecs
import logging
import urllib.parse
import urllib.request
from enum import Enum
from html.parser import HTMLParser
class Product:
    """One catalog entry: a name, a price and a SKU.

    A fresh instance starts out empty; the fields are filled in afterwards
    by whoever builds the product (in this file, ``Parser``).
    """

    def __init__(self):
        # Text fields start as empty strings so callers can build them up
        # with ``+=``. ``price`` is later replaced by a float once the
        # collected text has been converted by ``Parser``.
        self.price = ""
        self.name = ""
        # No SKU known yet.
        self.sku = None

    def __repr__(self):
        # Use the bare class name: formatting ``self.__class__`` directly
        # would embed "<class '...'>" inside the angle brackets.
        return "<{} name={} price={} sku={}>".format(
            type(self).__name__, self.name, self.price, self.sku)
class State(Enum):
    """States of the catalog parser's state machine."""

    # Inside a product row, between fields.
    parsing = 1
    # Not referenced by the parser code visible in this file.
    product = 2
    # Collecting the text of the product name.
    product_name = 3
    # Collecting the text of the price.
    price = 4
    # No product row is open.
    idle = 5
class Parser(HTMLParser):
    """Collect ``Product`` entries from the catalog's HTML.

    The parser is a small state machine driven by the HTMLParser callbacks:

    * ``<tr data-sku="...">`` opens a new product and records its SKU.
    * ``<h3 class="product-name">`` starts collecting the name; the next
      closing ``</a>`` stops it.
    * ``<span class="price">`` starts collecting the price text; the next
      closing ``</span>`` stops it.

    On any end tag, once the open product has a name, a price and a SKU, it
    is finalized (name stripped, price converted to ``float``) and appended
    to ``products``.
    """

    def __init__(self):
        super().__init__()
        self.products = []       # finished products, in document order
        self.current = None      # product being assembled, or None
        self.state = State.idle

    def error(self, message):
        """Log a parse problem instead of raising, so parsing can go on."""
        logging.error("Parser error: %s", message)

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on an opening tag."""
        attrs = dict(attrs)
        css_class = attrs.get("class")
        if tag == "tr" and "data-sku" in attrs:
            # A new SKU row always starts a fresh product. If the previous
            # row never became complete, drop it here; otherwise the parser
            # would stay stuck on it and merge this row's fields into it.
            if self.state != State.idle:
                logging.warning(
                    "Skipping incomplete product row: sku=%s",
                    getattr(self.current, "sku", None))
            self.current = Product()
            self.current.sku = attrs["data-sku"]
            self.state = State.parsing
        elif self.state == State.parsing and tag == 'h3' \
                and css_class == 'product-name':
            self.state = State.product_name
        elif self.state == State.parsing and tag == 'span' \
                and css_class == "price":
            self.state = State.price

    def handle_endtag(self, tag):
        """Stop collecting a field and finalize the product when complete."""
        if self.state == State.product_name and tag == 'a':
            self.state = State.parsing
        elif self.state == State.price and tag == 'span':
            self.state = State.parsing

        product = self.current
        if not (product and product.name and product.price and product.sku):
            return

        # "." is removed as a thousands separator, "," becomes the decimal
        # point, and everything from the first no-break space on (e.g. a
        # currency symbol) is discarded.
        raw_price = product.price
        number = raw_price.replace(".", "").replace(",", ".").split("\xa0")[0]
        try:
            price = float(number)
        except ValueError:
            # One malformed price must not abort the whole catalog: report
            # it the same way as other parse errors and drop this product.
            self.error("unparsable price %r for sku %s"
                       % (raw_price, product.sku))
        else:
            product.name = product.name.strip()
            product.price = price
            self.products.append(product)
        self.current = None
        self.state = State.idle

    def handle_data(self, data):
        """Append text to whichever field is currently being collected."""
        if self.state == State.product_name:
            self.current.name += data
        elif self.state == State.price:
            self.current.price += data
# Address of the catalog page that execute() downloads and parses.
URL = "http://www.edelmetall-handel.de/quickbuy/twozero/"
def execute():
    """Download the catalog page at ``URL`` and return its products.

    Always fetches the full catalog: every call issues a new HTTP request.

    Returns:
        The list of products collected by ``Parser``, in document order.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the page
            cannot be fetched.
    """
    request = urllib.request.Request(URL)
    with urllib.request.urlopen(request) as response:
        # Decode as UTF-8 and drop undecodable bytes, so a stray byte in
        # the page does not abort the whole fetch.
        html = response.read().decode('utf-8', errors='ignore')
    parser = Parser()
    parser.feed(html)
    # Make HTMLParser process any text it is still buffering.
    parser.close()
    return parser.products
if __name__ == "__main__":
    # Script entry point: fetch the catalog and pretty-print the result.
    import pprint

    pprint.pprint(execute())
|