diff options
author | Yves Fischer <yvesf-git@xapek.org> | 2018-07-17 12:34:04 +0200 |
---|---|---|
committer | Yves Fischer <yvesf-git@xapek.org> | 2018-07-17 12:49:49 +0200 |
commit | 69c220cda3d8c0a95327630f5752dad36cb82261 (patch) | |
tree | fad9fbe78cf717f4dd17b8e9d996ab9a54b7d3e2 /jobs/clever_tanken.py | |
download | datasources-69c220cda3d8c0a95327630f5752dad36cb82261.tar.gz datasources-69c220cda3d8c0a95327630f5752dad36cb82261.zip |
Squashed commit
Diffstat (limited to 'jobs/clever_tanken.py')
-rwxr-xr-x | jobs/clever_tanken.py | 114 |
1 files changed, 114 insertions, 0 deletions
diff --git a/jobs/clever_tanken.py b/jobs/clever_tanken.py new file mode 100755 index 0000000..8418b11 --- /dev/null +++ b/jobs/clever_tanken.py @@ -0,0 +1,114 @@ +import codecs +import logging +from enum import Enum +from html.parser import HTMLParser +from urllib import request + +State = Enum('State', 'fuel_name fuel_price station_name idle') + + +class Tankstelle: + def __init__(self): + self.name = "" + self.preise = {} + self.id = None + + def __repr__(self): + return "{}: {} {}".format(type(self).__name__, self.name, self.preise) + + +class Parser(HTMLParser): + def error(self, message): + logging.error("Parser error: %s", message) + + def __init__(self): + super().__init__() + self.tankstelle = Tankstelle() + self._current_fuel_name = None + self._state = State.idle + + def get_prix(self): + for key, value in self.tankstelle.preise.items(): + self.tankstelle.preise[key] = float(value) + return self.tankstelle + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if self._state == State.idle: + if tag == "div" and attrs.get('class') == 'fuel-price-type': + self._state = State.fuel_name + self._current_fuel_name = "" + if tag == "span" and (attrs.get('id') == "main-content-fuel-station-header-name" + or attrs.get('itemprop') == "http://schema.org/addressCountry"): + self._state = State.station_name + elif self._current_fuel_name is not None and tag == "span" and attrs.get('ng-bind') == "display_preis": + self._state = State.fuel_price + + def handle_endtag(self, tag): + if self._state == State.fuel_name and tag in ('span', 'div'): + self._state = State.idle + elif self._state == State.station_name and tag in ('span'): + self._state = State.idle + elif self._state == State.fuel_price and tag == 'span': + self._state = State.idle + preis = self.tankstelle.preise[self._current_fuel_name].strip() + if preis == "": + del self.tankstelle.preise[self._current_fuel_name] + else: + self.tankstelle.preise[self._current_fuel_name] = float(preis) + self._current_fuel_name = None + + def handle_data(self, data: str): + if self._state == State.fuel_name: + self._current_fuel_name += data.strip().replace(':', '') + self.tankstelle.preise[self._current_fuel_name] = "" + elif self._state == State.fuel_price: + self.tankstelle.preise[self._current_fuel_name] += data + elif self._state == State.station_name: + if len(data.strip()) > 0: + if len(self.tankstelle.name) > 0: + self.tankstelle.name += " " + self.tankstelle.name += data.strip() + + +URL = "http://www.clever-tanken.de/tankstelle_details/" + + +def execute(station_id: str): + parser = Parser() + r = request.Request(URL + station_id) + r.add_header('Host', 'www.clever-tanken.de') + r.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0') + try: + with request.urlopen(r) as f: + f2 = codecs.getreader('utf-8')(f) + f2.errors = 'ignore' + for line in f2.readlines(): + parser.feed(line) + + tankstelle = parser.tankstelle + tankstelle.id = station_id + return tankstelle + except Exception as e: + logging.error("Failed for station: %s", station_id) + raise e + + + +if __name__ == "__main__": + from pprint import pprint + + pprint(list(map(execute, [ + '20219', '11985', '17004', + '19715', # Kaiserst. Mineralölvertrieb Schwärzle + '54296', # ESSO Endingen + '10355', # ARAL Tiengen + '20144', # bft Rankackerweg + '27534', # EXTROL Freiburg + '55690', # Rheinmünster + '15220', # Esso Achern + '5853', # JET Rastatt + '24048', # Bodersweier + '27534', + '3819']) # JET Freiburg + )) |