From f9484b4b9d874867309a9697df46b328fb9ee302 Mon Sep 17 00:00:00 2001 From: sammort <29308020+sammort@users.noreply.github.com> Date: Wed, 3 Jan 2024 15:44:21 +0000 Subject: [PATCH 1/2] feat: add Vale of White Horse District Council This is a direct port of the South Oxfordshire District Council scraper as they use the exact same 'Binzone' website, with some values changed to determine the correct council. --- .../features/validate_council_outputs.feature | 1 + uk_bin_collection/tests/input.json | 5 ++ .../councils/ValeofWhiteHorseCouncil.py | 81 +++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py diff --git a/uk_bin_collection/tests/features/validate_council_outputs.feature b/uk_bin_collection/tests/features/validate_council_outputs.feature index 9f446ede22..f444f88514 100644 --- a/uk_bin_collection/tests/features/validate_council_outputs.feature +++ b/uk_bin_collection/tests/features/validate_council_outputs.feature @@ -118,6 +118,7 @@ Feature: Test each council output matches expected results | TorbayCouncil | None | None | | TorridgeDistrictCouncil | None | None | | ValeofGlamorganCouncil | None | None | + | ValeofWhiteHorseCouncil | None | None | | WakefieldCityCouncil | http://selenium:4444 | local | | WarwickDistrictCouncil | None | None | | WaverleyBoroughCouncil | None | None | diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json index becae11dd0..40fd5b203a 100644 --- a/uk_bin_collection/tests/input.json +++ b/uk_bin_collection/tests/input.json @@ -776,6 +776,11 @@ "url": "https://www.valeofglamorgan.gov.uk/en/living/Recycling-and-Waste/", "wiki_name": "Vale of Glamorgan Council" }, + "ValeofWhiteHorseCouncil": { + "uprn": "100121391443", + "url": "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb", + "wiki_name": "Vale of White Horse Council" + }, "WakefieldCityCouncil": { "custom_component_show_url_field": true, 
"skip_get_url": true, diff --git a/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py b/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py new file mode 100644 index 0000000000..4d152e0bd7 --- /dev/null +++ b/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py @@ -0,0 +1,81 @@ +import requests +from bs4 import BeautifulSoup +from uk_bin_collection.uk_bin_collection.common import * +from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass + + +# import the wonderful Beautiful Soup and the URL grabber +class CouncilClass(AbstractGetBinDataClass): + """ + Concrete classes have to implement all abstract operations of the + base class. They can also override some operations with a default + implementation. + """ + + def parse_data(self, page: str, **kwargs) -> dict: + user_uprn = kwargs.get("uprn") + check_uprn(user_uprn) + + # UPRN is passed in via a cookie. Set cookies/params and GET the page + cookies = { + "SVBINZONE": f"VALE%3AUPRN%40{user_uprn}", + } + headers = { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", + "Accept-Language": "en-GB,en;q=0.7", + "Cache-Control": "max-age=0", + "Connection": "keep-alive", + "Referer": "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb?SOVA_TAG=VALE&ebd=0&ebz=1_1704201201813", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "same-origin", + "Sec-Fetch-User": "?1", + "Sec-GPC": "1", + "Upgrade-Insecure-Requests": "1", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", + } + params = { + "SOVA_TAG": "VALE", + "ebd": "0", + } + requests.packages.urllib3.disable_warnings() + response = requests.get( + "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb", + params=params, + headers=headers, + cookies=cookies, + ) + + # Parse response text for super 
speedy finding + soup = BeautifulSoup(response.text, features="html.parser") + soup.prettify() + + data = {"bins": []} + + # Page has slider info side by side, which are two instances of this class + for bin in soup.find_all("div", {"class": "binextra"}): + bin_info = bin.text.split("-") + try: + # No date validation since year isn't included on webpage + bin_date = get_next_occurrence_from_day_month( + datetime.strptime( + bin_info[0].strip() + " " + datetime.today().strftime("%Y"), + "%A %d %B %Y", + ) + ).strftime(date_format) + bin_type = str.capitalize(bin_info[1].strip()) + except Exception as ex: + raise ValueError(f"Error parsing bin data: {ex}") + + # Build data dict for each entry + dict_data = { + "type": bin_type, + "collectionDate": bin_date, + } + data["bins"].append(dict_data) + + data["bins"].sort( + key=lambda x: datetime.strptime(x.get("collectionDate"), date_format) + ) + + return data From 433a71b6b848505f8cb9bee2863c2d0db1e73e85 Mon Sep 17 00:00:00 2001 From: sammort <29308020+sammort@users.noreply.github.com> Date: Wed, 3 Jan 2024 15:54:49 +0000 Subject: [PATCH 2/2] fix: account for additional string on exceptional schedule Biffa's 'Binzone' website introduces an additional string during exceptional bin collection schedules (e.g. due to English Bank Holidays). For example, currently the site returns these strings in the `binextra` div: - Your usual collection day is different this week - Friday 5 January - - green bin, textiles and food bin - No garden waste collection due to Christmas To help detect where the string containing the bin collection date is in the `binextra` div I've introduced a common utility function that leverages dateutil to detect whether a string contains a date or not. 
--- CONTRIBUTING.md | 1 + uk_bin_collection/uk_bin_collection/common.py | 15 ++++++++++ .../councils/SouthOxfordshireCouncil.py | 28 +++++++++++++------ .../councils/ValeofWhiteHorseCouncil.py | 28 +++++++++++++------ 4 files changed, 54 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8aeae9fbc6..a57969bccb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -156,6 +156,7 @@ Useful functions include: - a function to check [if a date is a holiday](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L117) in a given part of the UK - a function that returns the [dates of a given weekday](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L136) in N amounts of weeks - a function that returns a [list of dates every N days](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L148) from a given start date +- a function to check [if a string contains a date](./uk_bin_collection/uk_bin_collection/common.py#L249) (leverages [dateutil's parser](https://dateutil.readthedocs.io/en/stable/parser.html)) `common.py` also contains a [standardised date format](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L11) variable called `date_format`, which is useful to call when formatting datetimes. 
diff --git a/uk_bin_collection/uk_bin_collection/common.py b/uk_bin_collection/uk_bin_collection/common.py index 8dc2539e29..63a451b87b 100644 --- a/uk_bin_collection/uk_bin_collection/common.py +++ b/uk_bin_collection/uk_bin_collection/common.py @@ -6,6 +6,7 @@ import re import requests from datetime import datetime +from dateutil.parser import parse from enum import Enum from selenium import webdriver from selenium.webdriver.chrome.service import Service as ChromeService @@ -245,6 +246,20 @@ def validate_dates(bin_dates: dict) -> dict: raise NotImplementedError() # If a date is in December and the next is in January, increase the year +def contains_date(string, fuzzy=False) -> bool: + """ + Return whether the string can be interpreted as a date. + + :param string: str, string to check for date + :param fuzzy: bool, ignore unknown tokens in string if True + """ + try: + parse(string, fuzzy=fuzzy) + return True + + except ValueError: + return False + def create_webdriver(web_driver) -> webdriver.Chrome: """ diff --git a/uk_bin_collection/uk_bin_collection/councils/SouthOxfordshireCouncil.py b/uk_bin_collection/uk_bin_collection/councils/SouthOxfordshireCouncil.py index 3056e61ca9..e7491520af 100644 --- a/uk_bin_collection/uk_bin_collection/councils/SouthOxfordshireCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/SouthOxfordshireCouncil.py @@ -56,16 +56,26 @@ def parse_data(self, page: str, **kwargs) -> dict: # Page has slider info side by side, which are two instances of this class for bin in soup.find_all("div", {"class": "binextra"}): - bin_info = bin.text.split("-") + bin_info = list(bin.stripped_strings) try: - # No date validation since year isn't included on webpage - bin_date = get_next_occurrence_from_day_month( - datetime.strptime( - bin_info[0].strip() + " " + datetime.today().strftime("%Y"), - "%A %d %B %Y", - ) - ).strftime(date_format) - bin_type = str.capitalize(bin_info[1].strip()) + # On standard collection schedule, date will be 
contained in the first stripped string + if contains_date(bin_info[0]): + bin_date = get_next_occurrence_from_day_month( + datetime.strptime( + bin_info[0] + " " + datetime.today().strftime("%Y"), + "%A %d %B - %Y", + ) + ).strftime(date_format) + bin_type = str.capitalize(bin_info[1]) + # On exceptional collection schedule (e.g. around English Bank Holidays), date will be contained in the second stripped string + else: + bin_date = get_next_occurrence_from_day_month( + datetime.strptime( + bin_info[1] + " " + datetime.today().strftime("%Y"), + "%A %d %B - %Y", + ) + ).strftime(date_format) + bin_type = str.capitalize(bin_info[2]) except Exception as ex: raise ValueError(f"Error parsing bin data: {ex}") diff --git a/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py b/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py index 4d152e0bd7..79763f12a3 100644 --- a/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py @@ -54,16 +54,26 @@ def parse_data(self, page: str, **kwargs) -> dict: # Page has slider info side by side, which are two instances of this class for bin in soup.find_all("div", {"class": "binextra"}): - bin_info = bin.text.split("-") + bin_info = list(bin.stripped_strings) try: - # No date validation since year isn't included on webpage - bin_date = get_next_occurrence_from_day_month( - datetime.strptime( - bin_info[0].strip() + " " + datetime.today().strftime("%Y"), - "%A %d %B %Y", - ) - ).strftime(date_format) - bin_type = str.capitalize(bin_info[1].strip()) + # On standard collection schedule, date will be contained in the first stripped string + if contains_date(bin_info[0]): + bin_date = get_next_occurrence_from_day_month( + datetime.strptime( + bin_info[0] + " " + datetime.today().strftime("%Y"), + "%A %d %B - %Y", + ) + ).strftime(date_format) + bin_type = str.capitalize(bin_info[1]) + # On exceptional collection 
schedule (e.g. around English Bank Holidays), date will be contained in the second stripped string + else: + bin_date = get_next_occurrence_from_day_month( + datetime.strptime( + bin_info[1] + " " + datetime.today().strftime("%Y"), + "%A %d %B - %Y", + ) + ).strftime(date_format) + bin_type = str.capitalize(bin_info[2]) except Exception as ex: raise ValueError(f"Error parsing bin data: {ex}")