From 7e6068c563c5d100c81afe2340d7ae3af121ce6f Mon Sep 17 00:00:00 2001 From: David Park Date: Tue, 17 Dec 2024 18:42:49 +0000 Subject: [PATCH 01/17] fix: #1087 - Food waste date incorrect for West Berkshire Council --- .../uk_bin_collection/councils/WestBerkshireCouncil.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py index 8693a13592..61da5d4e01 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py @@ -6,6 +6,7 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import Select from selenium.webdriver.support.wait import WebDriverWait +from webdriver_manager.chrome import ChromeDriverManager from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass @@ -99,7 +100,7 @@ def parse_data(self, page: str, **kwargs) -> dict: ).replace(year=datetime.now().year) food_div = soup.find( - "div", {"id": "FINDYOURBINDAYS_RECYCLINGDATE_OUTERDIV"} + "div", {"id": "FINDYOURBINDAYS_FOODWASTEDATE_OUTERDIV"} ) food_date = food_div.find_all("div")[2] if food_date.text == "Today": From 16bcd1052cf726d17be63881e5b085f52ab700c5 Mon Sep 17 00:00:00 2001 From: David Park Date: Tue, 17 Dec 2024 18:44:54 +0000 Subject: [PATCH 02/17] fix: remove WDM import --- .../uk_bin_collection/councils/WestBerkshireCouncil.py | 1 - 1 file changed, 1 deletion(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py index 61da5d4e01..36908175f3 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py @@ -6,7 +6,6 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import Select from selenium.webdriver.support.wait import WebDriverWait -from webdriver_manager.chrome import ChromeDriverManager from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass From 80d3fc4bd56f248c221339ffcd843616c8f07ec2 Mon Sep 17 00:00:00 2001 From: David Park Date: Tue, 17 Dec 2024 21:51:02 +0000 Subject: [PATCH 03/17] fix: #1089 - Correct shifted dates in Bromley Borough Council --- .../uk_bin_collection/councils/BromleyBoroughCouncil.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py index 93f03c3df1..e116b7d930 100644 --- a/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py @@ -21,7 +21,6 @@ class CouncilClass(AbstractGetBinDataClass): """ def parse_data(self, page: str, **kwargs) -> dict: - # Make a BS4 object driver = None try: bin_data_dict = {"bins": []} @@ -76,12 +75,13 @@ def parse_data(self, page: str, **kwargs) -> dict: # Get the current year current_year = datetime.now().year + # Append the year to the date + date_with_year = date_object.replace(year=current_year) + # Check if the parsed date is in the past compared to the current date if date_object < datetime.now(): # If the parsed date is in the past, assume it's for the next year current_year += 1 - # Append the year to the date - date_with_year = date_object.replace(year=current_year) # Format the date with the year date_with_year_formatted = date_with_year.strftime( From 2f3c18db75914861ae2570f966dafc457f7791a4 Mon Sep 17 00:00:00 2001 From: David Park Date: Tue, 17 Dec 2024 22:13:46 +0000 Subject: [PATCH 04/17] fix: replace West Lindsey's input with working address --- uk_bin_collection/tests/input.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json index 58df932495..8b1919dd81 100644 --- a/uk_bin_collection/tests/input.json +++ b/uk_bin_collection/tests/input.json @@ -1963,8 +1963,8 @@ "wiki_note": "You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN." }, "WestLindseyDistrictCouncil": { - "house_number": "PRIVATE ACCOMMODATION", - "postcode": "LN8 2AR", + "house_number": "35", + "postcode": "LN8 3AX", "skip_get_url": true, "url": "https://www.west-lindsey.gov.uk/", "wiki_name": "West Lindsey District Council", From 1e5e7fe92aaf5ac301f507d4c28f31fd526c81ba Mon Sep 17 00:00:00 2001 From: David Park Date: Tue, 17 Dec 2024 22:14:36 +0000 Subject: [PATCH 05/17] fix: correct date/year logic for West Lindsey District Council --- .../councils/WestLindseyDistrictCouncil.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py index b4e9fd0408..4ba4068995 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py @@ -92,13 +92,14 @@ def parse_data(self, page: str, **kwargs) -> dict: for bin_date in bin_dates: # Split the bin date into day and month and build a full date with the current year split_date = bin_date.split("/") + if len(split_date[0]) < 1: + raise ValueError("Error parsing dates retrieved from website") full_date = datetime( datetime.now().year, int(split_date[1]), int(split_date[0]) ) - # If the current month is December and one of the next collections is in January, increment the year - if datetime.now().month == 12 and int(split_date[1]) == 1: - full_date = bin_date + relativedelta(years=1) + if datetime.now().month == 12 and int(split_date[1]) < 12: + full_date = datetime(year=datetime.now().year + 1, month=int(split_date[1]), day=int(split_date[0])) # Since data in unordered, add to a tuple collections.append((bin_type.title(), full_date)) From f057564f8705ddaed1d119e6127bb591c65791b0 Mon Sep 17 00:00:00 2001 From: m26dvd <31007572+m26dvd@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:38:06 +0000 Subject: [PATCH 06/17] fix: Swale Borough Council fix: #1080 (cherry picked from commit 6f580b39fb68b8079990221e050ae8dd6d2b7285) --- .../councils/SwaleBoroughCouncil.py | 60 +++++++++++++------ 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py index 3d607961cb..28202f9fa9 100644 --- a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py @@ -26,7 +26,7 @@ def parse_data(self, page: str, **kwargs) -> dict: check_paon(user_paon) # Build URL to parse - council_url = "https://swale.gov.uk/bins-littering-and-the-environment/bins/my-collection-day" + council_url = "https://swale.gov.uk/bins-littering-and-the-environment/bins/check-your-bin-day" # Create Selenium webdriver driver = create_webdriver(web_driver, headless, None, __name__) @@ -35,7 +35,7 @@ def parse_data(self, page: str, **kwargs) -> dict: # Wait for the postcode field to appear then populate it try: inputElement_postcode = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.ID, "q462406_q1")) + EC.presence_of_element_located((By.ID, "q485476_q1")) ) inputElement_postcode.send_keys(user_postcode) except Exception: @@ -43,7 +43,7 @@ def parse_data(self, page: str, **kwargs) -> dict: # Click search button findAddress = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.ID, "form_email_462397_submit")) + EC.presence_of_element_located((By.ID, "form_email_485465_submit")) ) driver.execute_script("arguments[0].click();", findAddress) @@ -52,7 +52,7 @@ def parse_data(self, page: str, **kwargs) -> dict: EC.element_to_be_clickable( ( By.XPATH, - "//select[@id='SBCYBDAddressList']//option[contains(., '" + "//select[@name='q485480:q1']//option[contains(., '" + user_paon + "')]", ) @@ -61,12 +61,12 @@ def parse_data(self, page: str, **kwargs) -> dict: # Click search button getBins = WebDriverWait(driver, 10).until( - EC.presence_of_element_located((By.ID, "form_email_462397_submit")) + EC.presence_of_element_located((By.ID, "form_email_485465_submit")) ) driver.execute_script("arguments[0].click();", getBins) BinTable = WebDriverWait(driver, 30).until( - EC.presence_of_element_located((By.ID, "SBC-YBD-Main")) + EC.presence_of_element_located((By.ID, "SBCYBDSummary")) ) soup = BeautifulSoup(driver.page_source, features="html.parser") @@ -74,17 +74,41 @@ def parse_data(self, page: str, **kwargs) -> dict: data = {"bins": []} - # Get the collection bullet points on the page and parse them - nextCollections = soup.find("div", {"id": "nextCollections"}) - for c in nextCollections: - collection = c.find_all("strong") - for bin in collection: - split = (bin.text).split(" on ") - bin_type = split[0] - bin_date = datetime.strptime(split[1], "%A %d %b %Y").strftime( - "%d/%m/%Y" - ) - dict_data = {"type": bin_type, "collectionDate": bin_date} - data["bins"].append(dict_data) + next_collection_date = soup.find( + "strong", id="SBC-YBD-collectionDate" + ).text.strip() + + # Extract bins for the next collection + next_bins = [li.text.strip() for li in soup.select("#SBCFirstBins ul li")] + + # Extract future collection details + future_collection_date_tag = soup.find( + "p", text=lambda t: t and "starting from" in t + ) + future_collection_date = ( + future_collection_date_tag.text.split("starting from")[-1].strip() + if future_collection_date_tag + else "No future date found" + ) + + future_bins = [li.text.strip() for li in soup.select("#FirstFutureBins li")] + + for bin in next_bins: + dict_data = { + "type": bin, + "collectionDate": datetime.strptime( + next_collection_date, "%A, %d %B" + ).strftime(date_format), + } + data["bins"].append(dict_data) + + for bin in future_bins: + dict_data = { + "type": bin, + "collectionDate": datetime.strptime( + future_collection_date, "%A, %d %B" + ).strftime(date_format), + } + data["bins"].append(dict_data) return data From b72faca9ce84791ae076ca0e24a4bdb5e0f8ecad Mon Sep 17 00:00:00 2001 From: m26dvd <31007572+m26dvd@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:36:26 +0000 Subject: [PATCH 07/17] fix: London Borough Sutton fix: #1076 (cherry picked from commit 1eab20c9a57c9c4438ea343f374202bb2e9b98ca) --- .../councils/LondonBoroughSutton.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/LondonBoroughSutton.py b/uk_bin_collection/uk_bin_collection/councils/LondonBoroughSutton.py index 9571a27c12..8537e1ac32 100644 --- a/uk_bin_collection/uk_bin_collection/councils/LondonBoroughSutton.py +++ b/uk_bin_collection/uk_bin_collection/councils/LondonBoroughSutton.py @@ -42,14 +42,18 @@ def parse_data(self, page: str, **kwargs) -> dict: bin_type = service.get_text( strip=True ) # Bin type name (e.g., 'Food waste', 'Mixed recycling') - if bin_type == "Bulky waste": + if bin_type == "Bulky Waste": continue service_details = service.find_next("div", class_="govuk-grid-row") next_collection = ( - service_details.find("dt", string="Next collection") - .find_next_sibling("dd") - .get_text(strip=True) + ( + service_details.find("dt", string="Next collection") + .find_next_sibling("dd") + .get_text(strip=True) + ) + .replace("(this collection has been adjusted from its usual time)", "") + .strip() ) next_collection = datetime.strptime( From 7923cded427bf987476fa0b01b116700cc61506a Mon Sep 17 00:00:00 2001 From: m26dvd <31007572+m26dvd@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:49:32 +0000 Subject: [PATCH 08/17] fix: Merton Council (cherry picked from commit 8e70e6aaffa19c6916c96f48af990e3ebe4da462) --- .../councils/MertonCouncil.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py b/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py index b7b994983d..0912ce3aee 100644 --- a/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py @@ -1,5 +1,6 @@ # This script pulls (in one hit) the data from Merton Council Bins Data from bs4 import BeautifulSoup + from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass @@ -33,6 +34,11 @@ def parse_data(self, page: str, **kwargs) -> dict: ), ) + possible_formats = [ + "%d %B %Y", + "%A %d %B %Y", + ] + # Loops the Rows for row in rows: # Get all the cells @@ -40,9 +46,15 @@ def parse_data(self, page: str, **kwargs) -> dict: # First cell is the bin_type bin_type = cells[0].get_text().strip() # Date is on the second cell, second paragraph, wrapped in p - collectionDate = datetime.strptime( - cells[1].select("p > b")[2].get_text(strip=True), "%d %B %Y" - ) + collectionDate = None + for date_format in possible_formats: + try: + collectionDate = datetime.strptime( + cells[1].select("p > b")[2].get_text(strip=True), date_format + ) + break # Exit the loop if parsing is successful + except ValueError: + continue # Add each collection to the list as a tuple collections.append((bin_type, collectionDate)) From 3a6716dafa63bf46d2ffd0097c0a150a6b848f35 Mon Sep 17 00:00:00 2001 From: David Park Date: Wed, 18 Dec 2024 23:08:06 +0000 Subject: [PATCH 09/17] fix: correct date logic for Swale Borough Council --- .../councils/SwaleBoroughCouncil.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py index 28202f9fa9..665661e4ac 100644 --- a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py @@ -6,8 +6,18 @@ from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass -# import the wonderful Beautiful Soup and the URL grabber +def parse_collection_date(date_string) -> datetime: + now = datetime.now() + if date_string == "is due today": + return now + + parsed_date = datetime.strptime(date_string, "%A, %d %B").replace(year=now.year) + + if now.month == 12 and parsed_date.month < 12: + parsed_date = parsed_date.replace(year=(now.year + 1)) + + return parsed_date class CouncilClass(AbstractGetBinDataClass): """ @@ -79,7 +89,7 @@ def parse_data(self, page: str, **kwargs) -> dict: ).text.strip() # Extract bins for the next collection - next_bins = [li.text.strip() for li in soup.select("#SBCFirstBins ul li")] + next_bins = [li.text.strip().capitalize() for li in soup.select("#SBCFirstBins ul li")] # Extract future collection details future_collection_date_tag = soup.find( @@ -91,23 +101,19 @@ def parse_data(self, page: str, **kwargs) -> dict: else "No future date found" ) - future_bins = [li.text.strip() for li in soup.select("#FirstFutureBins li")] + future_bins = [li.text.strip().capitalize() for li in soup.select("#FirstFutureBins li")] for bin in next_bins: dict_data = { "type": bin, - "collectionDate": datetime.strptime( - next_collection_date, "%A, %d %B" - ).strftime(date_format), + "collectionDate": parse_collection_date(next_collection_date).strftime(date_format), } data["bins"].append(dict_data) for bin in future_bins: dict_data = { "type": bin, - "collectionDate": datetime.strptime( - future_collection_date, "%A, %d %B" - ).strftime(date_format), + "collectionDate": parse_collection_date(future_collection_date).strftime(date_format), } data["bins"].append(dict_data) From 5ad9a195013e9d1550d5618d76384c4c7436a772 Mon Sep 17 00:00:00 2001 From: David Park Date: Wed, 18 Dec 2024 23:13:50 +0000 Subject: [PATCH 10/17] fix: change date format to project default for Merton Council --- uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py b/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py index 0912ce3aee..e84066b009 100644 --- a/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/MertonCouncil.py @@ -47,10 +47,10 @@ def parse_data(self, page: str, **kwargs) -> dict: bin_type = cells[0].get_text().strip() # Date is on the second cell, second paragraph, wrapped in p collectionDate = None - for date_format in possible_formats: + for format in possible_formats: try: collectionDate = datetime.strptime( - cells[1].select("p > b")[2].get_text(strip=True), date_format + cells[1].select("p > b")[2].get_text(strip=True), format ) break # Exit the loop if parsing is successful except ValueError: From 88e3ae833d3d19a947b92ff52677bc95a0d6c8ee Mon Sep 17 00:00:00 2001 From: m26dvd <31007572+m26dvd@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:42:42 +0000 Subject: [PATCH 11/17] feat: Adding Wandsworth Council fix: #1078 (cherry picked from commit 89d93666bb659010d1c130b98c1d81c6ff80cf7c) --- uk_bin_collection/tests/input.json | 7 ++ .../councils/WandsworthCouncil.py | 74 +++++++++++++++++++ wiki/Councils.md | 12 +++ 3 files changed, 93 insertions(+) create mode 100644 uk_bin_collection/uk_bin_collection/councils/WandsworthCouncil.py diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json index 8b1919dd81..1fbeba68ff 100644 --- a/uk_bin_collection/tests/input.json +++ b/uk_bin_collection/tests/input.json @@ -1897,6 +1897,13 @@ "wiki_name": "Waltham Forest", "wiki_note": "Use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find your UPRN." }, + "WandsworthCouncil": { + "url": "https://www.wandsworth.gov.uk", + "wiki_command_url_override": "https://www.wandsworth.gov.uk", + "uprn": "100022684035", + "wiki_name": "Wandsworth Council", + "wiki_note": "You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN." + }, "WarringtonBoroughCouncil": { "url": "https://www.warrington.gov.uk", "wiki_command_url_override": "https://www.warrington.gov.uk", diff --git a/uk_bin_collection/uk_bin_collection/councils/WandsworthCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WandsworthCouncil.py new file mode 100644 index 0000000000..00ec7b6e1c --- /dev/null +++ b/uk_bin_collection/uk_bin_collection/councils/WandsworthCouncil.py @@ -0,0 +1,74 @@ +import requests +from bs4 import BeautifulSoup + +from uk_bin_collection.uk_bin_collection.common import * +from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass + + +# import the wonderful Beautiful Soup and the URL grabber +class CouncilClass(AbstractGetBinDataClass): + """ + Concrete classes have to implement all abstract operations of the + base class. They can also override some operations with a default + implementation. + """ + + def parse_data(self, page: str, **kwargs) -> dict: + + user_uprn = kwargs.get("uprn") + check_uprn(user_uprn) + bindata = {"bins": []} + + URI = f"https://www.wandsworth.gov.uk/my-property/?UPRN={user_uprn}" + + # Make the GET request + response = requests.get(URI) + + soup = BeautifulSoup(response.content, features="html.parser") + soup.prettify() + + # Find all collection types + collection_types = soup.find_all("h4", class_="collection-heading") + + # Iterate over each collection type + for collection_type in collection_types: + bin_types = collection_type.text.strip().split("/") + collections = collection_type.find_next_sibling("div", class_="collections") + + # Extract next and previous collections + next_collection = collections.find_all("div", class_="collection") + + # Parse each collection + for collection in next_collection: + # Extract the collection type (Next or Previous) + strong_tag = collection.find("strong") + collection_type = ( + strong_tag.text.strip(":") if strong_tag else "Unknown" + ) + + # Extract the date + date_text = ( + strong_tag.next_sibling.strip() + if strong_tag and strong_tag.next_sibling + else "No date found" + ) + + if date_text == "No date found": + continue + + for bin_type in bin_types: + # Append to the schedule + dict_data = { + "type": bin_type, + "collectionDate": datetime.strptime( + date_text, + "%A %d %B %Y", + ).strftime(date_format), + } + bindata["bins"].append(dict_data) + + bindata["bins"].sort( + key=lambda x: datetime.strptime(x.get("collectionDate"), date_format) + ) + + return bindata diff --git a/wiki/Councils.md b/wiki/Councils.md index 3403f61634..a11d2527ec 100644 --- a/wiki/Councils.md +++ b/wiki/Councils.md @@ -255,6 +255,7 @@ This document is still a work in progress, don't worry if your council isn't lis - [Wakefield City Council](#wakefield-city-council) - [Walsall Council](#walsall-council) - [Waltham Forest](#waltham-forest) +- [Wandsworth Council](#wandsworth-council) - [Warrington Borough Council](#warrington-borough-council) - [Warwick District Council](#warwick-district-council) - [Watford Borough Council](#watford-borough-council) @@ -3237,6 +3238,17 @@ Note: Use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find your U --- +### Wandsworth Council +```commandline +python collect_data.py WandsworthCouncil https://www.wandsworth.gov.uk -u XXXXXXXX +``` +Additional parameters: +- `-u` - UPRN + +Note: You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN. + +--- + ### Warrington Borough Council ```commandline python collect_data.py WarringtonBoroughCouncil https://www.warrington.gov.uk -u XXXXXXXX From c161f0df332d77a1eace5fecdf44a798de3aa03e Mon Sep 17 00:00:00 2001 From: David Park Date: Wed, 18 Dec 2024 23:20:52 +0000 Subject: [PATCH 12/17] fix: add missing backticks to separate colour config and standard usage instructions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48ae4e7308..e1484ae237 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ If you miss this on the first setup you can reconfigure it. "color": "blue" } } - +``` --- ## Standalone Usage From 53de60085ffcf09be120d136ebdadc7b42fdb5b4 Mon Sep 17 00:00:00 2001 From: David Park Date: Wed, 18 Dec 2024 23:45:47 +0000 Subject: [PATCH 13/17] feat: #1067 - Add garden bin collections where available for Norwich City Council --- .../councils/NorwichCityCouncil.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/NorwichCityCouncil.py b/uk_bin_collection/uk_bin_collection/councils/NorwichCityCouncil.py index 2ad26dd7c8..3cc907459c 100644 --- a/uk_bin_collection/uk_bin_collection/councils/NorwichCityCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/NorwichCityCouncil.py @@ -48,15 +48,13 @@ def parse_data(self, page: str, **kwargs) -> dict: alternateCheck = False strong = soup.find_all("strong") + collections = [] if alternateCheck: bin_types = strong[2].text.strip().replace(".", "").split(" and ") for bin in bin_types: - dict_data = { - "type": bin, - "collectionDate": strong[1].text.strip(), - } - bindata["bins"].append(dict_data) + collections.append((bin.capitalize(), datetime.strptime(strong[1].text.strip(), date_format))) + else: p_tag = soup.find_all("p") i = 1 @@ -65,11 +63,18 @@ def parse_data(self, page: str, **kwargs) -> dict: p.text.split("Your ")[1].split(" is collected")[0].split(" and ") ) for bin in bin_types: - dict_data = { - "type": bin, - "collectionDate": strong[i].text.strip(), - } - bindata["bins"].append(dict_data) + collections.append((bin.capitalize(), datetime.strptime(strong[1].text.strip(), date_format))) i += 2 + if len(strong) > 3: + collections.append(("Garden", datetime.strptime(strong[4].text.strip(), date_format))) + + ordered_data = sorted(collections, key=lambda x: x[1]) + for item in ordered_data: + dict_data = { + "type": item[0] + " bin", + "collectionDate": item[1].strftime(date_format), + } + bindata["bins"].append(dict_data) + return bindata From b1d83588e8de23392ab76ae231282743d2e2145a Mon Sep 17 00:00:00 2001 From: David Park Date: Thu, 19 Dec 2024 00:43:11 +0000 Subject: [PATCH 14/17] fix: #1073 - change method of generating bin types to avoid manual mapping for Rugby Borough Council --- .../uk_bin_collection/councils/RugbyBoroughCouncil.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py index f0e91b37aa..52ef26aa5a 100644 --- a/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py @@ -13,11 +13,6 @@ class CouncilClass(AbstractGetBinDataClass): def parse_data(self, page: str, **kwargs) -> dict: data = {"bins": []} - bin_types = { - "240L RUBBISH BIN": "Black bin", - "240L GARDEN BIN": "Green bin", - "180L RECYCLING BIN": "Blue lid bin", - } collections = [] user_postcode = kwargs.get("postcode") @@ -73,7 +68,8 @@ def parse_data(self, page: str, **kwargs) -> dict: for row in table_rows: row_text = row.text.strip().split("\n") - bin_type = bin_types.get(row_text[0]) + bin_text = row_text[0].split(" ") + bin_type = ' '.join(bin_text[1:]).capitalize() collections.append( (bin_type, datetime.strptime(row_text[1], "%A %d %b %Y")) ) From 048b75fc53c87605916f41f7d947bb9b6c8815a1 Mon Sep 17 00:00:00 2001 From: David Park Date: Thu, 19 Dec 2024 11:28:14 +0000 Subject: [PATCH 15/17] feat: #1063 - rewrite Kirklees Council parser for new website --- .../councils/KirkleesCouncil.py | 117 ++++++++++-------- 1 file changed, 64 insertions(+), 53 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py b/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py index fd0a299e64..238c978f6c 100644 --- a/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py @@ -1,12 +1,17 @@ +import time from datetime import datetime from typing import Optional +from bs4 import BeautifulSoup from selenium.common import TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.remote.webdriver import WebDriver from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait +from webdriver_manager.drivers.chrome import ChromeDriver + +from selenium import webdriver from uk_bin_collection.uk_bin_collection.common import create_webdriver from uk_bin_collection.uk_bin_collection.common import date_format @@ -55,78 +60,84 @@ def _parse_data(self, page: str, **kwargs) -> dict: - Extract info from the 'alt' attribute of the images on that page """ - bins = [] + data = {"bins": []} + collections = [] user_paon = kwargs["paon"] user_postcode = kwargs["postcode"] - self._driver = driver = create_webdriver( - web_driver=kwargs["web_driver"], - headless=kwargs.get("headless", True), - session_name=__name__, - ) + self._driver = driver = webdriver.Chrome() + # self._driver = driver = create_webdriver( + # web_driver=kwargs["web_driver"], + # headless=kwargs.get("headless", True), + # session_name=__name__, + # ) driver.implicitly_wait(1) driver.get( - "https://www.kirklees.gov.uk/beta/your-property-bins-recycling/your-bins/default.aspx" + "https://my.kirklees.gov.uk/service/Bins_and_recycling___Manage_your_bins" ) - wait_for_element( - driver, By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises" - ) + time.sleep(5) + + # Switch to iframe + iframe = driver.find_element(By.CSS_SELECTOR, "#fillform-frame-1") + driver.switch_to.frame(iframe) - house_input = driver.find_element( - By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises" + wait_for_element( + driver, By.ID, "mandatory_Postcode", timeout=10 ) - house_input.send_keys(user_paon) postcode_input = driver.find_element( - By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoSearch" + By.ID, "Postcode" ) postcode_input.send_keys(user_postcode) - # submit address search - driver.find_element(By.ID, "butGeoSearch").send_keys(Keys.RETURN) + wait_for_element(driver, By.ID, "List") + time.sleep(2) + + WebDriverWait(driver, 10).until( + EC.element_to_be_clickable( + ( + By.XPATH, + "//select[@name='List']//option[contains(., '" + + user_paon + + "')]", + ) + ) + ).click() - wait_for_element( - driver, - By.ID, - "cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor", - # submitting can be slow - timeout=30, - ) + time.sleep(10) - # Open the panel - driver.find_element( - By.ID, "cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor" - ).click() + # For whatever reason, the page sometimes automatically goes to the next step + next_button = driver.find_element(By.XPATH, '/html/body/div/div/section/form/div/nav/div[2]/button') + if next_button.is_displayed(): + next_button.click() - # Domestic waste calendar - wait_for_element( - driver, By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar" - ) - calendar_link = driver.find_element( - By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar" - ) - driver.execute_script("arguments[0].click();", calendar_link) - # Recycling                      collection date 14 March 2024 Date: Thu, 19 Dec 2024 11:51:56 +0000 Subject: [PATCH 16/17] fix: Remove invalid escape sequence warnings from West Lindsey District Council --- .../councils/WestLindseyDistrictCouncil.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py index 4ba4068995..47f3061b9b 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py @@ -32,7 +32,7 @@ def parse_data(self, page: str, **kwargs) -> dict: # Strip data and parse the JSON address_data = json.loads( - re.sub("getAddressesCallback\d+\(", "", address_data)[:-2] + re.sub(r"getAddressesCallback\d+\(", "", address_data)[:-2] ) if address_data["TotalHits"] == 0: @@ -48,9 +48,7 @@ def parse_data(self, page: str, **kwargs) -> dict: address_x = address_data["Locations"][0]["X"] address_y = address_data["Locations"][0]["Y"] - stage2_url = "https://wlnk.statmap.co.uk/map/Cluster.svc/getpage?script=\Cluster\Cluster.AuroraScript$&taskId=bins&format=js&updateOnly=true&query=x%3D{}%3By%3D{}%3Bid%3D{}".format( - address_x, address_y, address_id - ) + stage2_url = fr"https://wlnk.statmap.co.uk/map/Cluster.svc/getpage?script=\Cluster\Cluster.AuroraScript$&taskId=bins&format=js&updateOnly=true&query=x%3D{address_x}%3By%3D{address_y}%3Bid%3D{address_id}" bin_query = requests.get(stage2_url).text @@ -61,7 +59,7 @@ def parse_data(self, page: str, **kwargs) -> dict: ) # Return only the HTML contained within the Javascript function payload. - pattern = 'document\.getElementById\("DR1"\)\.innerHTML="(.+)";' + pattern = r'document\.getElementById\("DR1"\)\.innerHTML="(.+)";' bin_html = re.findall(pattern, bin_query) @@ -86,7 +84,7 @@ def parse_data(self, page: str, **kwargs) -> dict: # Get bin date bin_date_text = row.text - pattern = "\d+\/\d+" + pattern = r"\d+\/\d+" bin_dates = re.findall(pattern, bin_date_text) for bin_date in bin_dates: From 6e610b4082df8701f4e3dfcb206ee54397ee72de Mon Sep 17 00:00:00 2001 From: David Park Date: Sun, 29 Dec 2024 16:41:01 +0000 Subject: [PATCH 17/17] fix: #1101 - Fix table parsing for Walsall Council --- .../councils/WalsallCouncil.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py index ca305cb68d..4361c7ed62 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py @@ -28,21 +28,25 @@ def parse_data(self, page: str, **kwargs) -> dict: response = requests.get(URI, headers=headers) soup = BeautifulSoup(response.text, "html.parser") - # Extract links to collection shedule pages and iterate through the pages - schedule_links = soup.findAll("a", {"class": "nav-link"}, href=True) + # Extract links to collection schedule pages and iterate through the pages + schedule_links = soup.findAll("td") + for item in schedule_links: - if "roundname" in item["href"]: + if "roundname" in item.contents[1]["href"]: # get bin colour - bincolour = item["href"].split("=")[-1].split("%")[0].upper() - binURL = "https://cag.walsall.gov.uk" + item["href"] - r = requests.get(binURL, headers=headers) + bin_colour = item.contents[1]["href"].split("=")[-1].split("%")[0].upper() + bin_url = "https://cag.walsall.gov.uk" + item.contents[1]["href"] + r = requests.get(bin_url, headers=headers) + if r.status_code != 200: + print(f"Collection details for {bin_colour.lower()} bin could not be retrieved.") + break soup = BeautifulSoup(r.text, "html.parser") table = soup.findAll("tr") for tr in table: td = tr.findAll("td") if td: dict_data = { - "type": bincolour, + "type": bin_colour.capitalize() + " bin", "collectionDate": datetime.strptime( td[1].text.strip(), "%d/%m/%Y" ).strftime("%d/%m/%Y"), @@ -50,7 +54,7 @@ def parse_data(self, page: str, **kwargs) -> dict: bindata["bins"].append(dict_data) bindata["bins"].sort( - key=lambda x: datetime.strptime(x.get("collectionDate"), "%d/%m/%Y") + key=lambda x: datetime.strptime(x.get("collectionDate"), date_format) ) return bindata