From 167c15cae914d7ff04e2dbdf6793d5ed25618c55 Mon Sep 17 00:00:00 2001
From: Simon Drake
Date: Thu, 7 Dec 2023 13:13:16 +0000
Subject: [PATCH] feat: Add BedfordshireCouncil scraper

---
 .../features/validate_council_outputs.feature |  1 +
 uk_bin_collection/tests/input.json            | 11 ++-
 .../councils/BedfordshireCouncil.py           | 67 +++++++++++++++++++
 3 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 uk_bin_collection/uk_bin_collection/councils/BedfordshireCouncil.py

diff --git a/uk_bin_collection/tests/features/validate_council_outputs.feature b/uk_bin_collection/tests/features/validate_council_outputs.feature
index e49f801fe9..c32b0885a9 100644
--- a/uk_bin_collection/tests/features/validate_council_outputs.feature
+++ b/uk_bin_collection/tests/features/validate_council_outputs.feature
@@ -13,6 +13,7 @@ Feature: Test each council output matches expected results
         | BasingstokeCouncil              | None | None |
         | BathAndNorthEastSomersetCouncil | None | None |
         | BCPCouncil                      | None | None |
+        | BedfordshireCouncil             | None | None |
         | BexleyCouncil                   | None | None |
         | BlackburnCouncil                | None | None |
         | BoltonCouncil                   | None | None |
diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
index f86c5db254..37f26c3da4 100644
--- a/uk_bin_collection/tests/input.json
+++ b/uk_bin_collection/tests/input.json
@@ -32,6 +32,12 @@
         "url": "https://www.bathnes.gov.uk/webforms/waste/collectionday/",
         "wiki_name": "Bath and North East Somerset Council"
     },
+    "BedfordshireCouncil": {
+        "skip_get_url": true,
+        "url": "https://www.centralbedfordshire.gov.uk/info/163/bins_and_waste_collections_-_check_bin_collection_day",
+        "wiki_name": "Bedfordshire Council",
+        "wiki_note": "To use this parser, you must provide a valid postcode and the UPRN for your address, which can be retrieved from the council's website."
+    },
     "BexleyCouncil": {
         "skip_get_url": true,
         "uprn": "spamstorage@live.co.uk",
@@ -393,7 +399,7 @@
         "wiki_command_url_override": "https://community.newcastle.gov.uk/my-neighbourhood/ajax/getBinsNew.php?uprn=XXXXXXXX",
         "wiki_name": "Newcastle City Council",
         "wiki_note": "Replace XXXXXXXX with UPRN."
-    },
+    },
     "NorthEastDerbyshireDistrictCouncil": {
         "skip_get_url": true,
         "uprn": "010034492221",
@@ -750,4 +756,5 @@
         "url": "https://waste-api.york.gov.uk/api/Collections/GetBinCollectionDataForUprn/",
         "wiki_name": "York Council"
     }
-}
\ No newline at end of file
+}
+
diff --git a/uk_bin_collection/uk_bin_collection/councils/BedfordshireCouncil.py b/uk_bin_collection/uk_bin_collection/councils/BedfordshireCouncil.py
new file mode 100644
index 0000000000..15aeb7ef56
--- /dev/null
+++ b/uk_bin_collection/uk_bin_collection/councils/BedfordshireCouncil.py
@@ -0,0 +1,67 @@
+from datetime import datetime
+
+import requests
+from bs4 import BeautifulSoup
+from uk_bin_collection.uk_bin_collection.common import *
+from uk_bin_collection.uk_bin_collection.get_bin_data import \
+    AbstractGetBinDataClass
+
+
+class CouncilClass(AbstractGetBinDataClass):
+    """
+    Concrete classes have to implement all abstract operations of the
+    base class. They can also override some operations with a default
+    implementation.
+    """
+
+    def parse_data(self, page: str, **kwargs) -> dict:
+        user_uprn = kwargs.get("uprn")
+        user_postcode = kwargs.get("postcode")
+
+        check_uprn(user_uprn)
+        check_postcode(user_postcode)
+
+        # Start a new session to walk through the form
+        requests.packages.urllib3.disable_warnings()
+        s = requests.session()
+
+        headers = {
+            'Origin': 'https://www.centralbedfordshire.gov.uk',
+            'Referer': 'https://www.centralbedfordshire.gov.uk/info/163/bins_and_waste_collections_-_check_bin_collection_day',
+        }
+
+        files = {
+            'postcode': (None, user_postcode),
+            'address': (None, user_uprn),
+        }
+
+        response = s.post(
+            'https://www.centralbedfordshire.gov.uk/info/163/bins_and_waste_collections_-_check_bin_collection_day#my_bin_collections',
+            headers=headers,
+            files=files,
+        )
+
+        # Make that BS4 object and use it to prettify the response
+        soup = BeautifulSoup(response.content, features="html.parser")
+        soup.prettify()
+
+        collections_div = soup.find(id="collections")
+
+        # Each collection is an <h3> containing the date; the bin type is the text node after the following <br>
+        collections = []
+        for heading in collections_div.find_all("h3"):
+            bin_type = heading.find_next("br").next_sibling
+            collection_date = datetime.strptime(heading.text, "%A, %d %B %Y")
+            collections.append((bin_type, collection_date))
+
+        # Sort the collections into date order rather than bin type, then return as a dictionary (with str date)
+        ordered_data = sorted(collections, key=lambda x: x[1])
+        data = {"bins": []}
+        for item in ordered_data:
+            dict_data = {
+                "type": item[0],
+                "collectionDate": item[1].strftime(date_format),
+            }
+            data["bins"].append(dict_data)
+
+        return data
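
Reviewer note: below is a minimal smoke-test sketch for exercising the new parser outside the test harness. It assumes the package is importable under the module path added in this patch, and the postcode and UPRN are placeholders that must be replaced with real values retrieved from the council's website. Because the scraper fetches its own data with a POST, the page argument is unused and an empty string is passed.

    # smoke_test_bedfordshire.py -- illustrative only; the postcode/UPRN below are placeholders
    from uk_bin_collection.uk_bin_collection.councils.BedfordshireCouncil import (
        CouncilClass,
    )

    if __name__ == "__main__":
        parser = CouncilClass()
        # parse_data() performs its own request, so the page argument can be empty
        data = parser.parse_data("", postcode="AB1 2CD", uprn="000000000000")
        for entry in data["bins"]:
            print(entry["type"], entry["collectionDate"])

If run against a real address, the output should match the shape the behave feature expects: a "bins" list of entries with "type" and "collectionDate" keys.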