From c4f81f588a87714fd59942aac68ff318265499f7 Mon Sep 17 00:00:00 2001
From: Robert Bradley
Date: Thu, 16 May 2024 08:17:00 +0100
Subject: [PATCH] fix: #709 Update DoverDistrictCouncil.py

---
 .../councils/DoverDistrictCouncil.py | 80 +++++++++----------
 1 file changed, 36 insertions(+), 44 deletions(-)

diff --git a/uk_bin_collection/uk_bin_collection/councils/DoverDistrictCouncil.py b/uk_bin_collection/uk_bin_collection/councils/DoverDistrictCouncil.py
index abc8a2c5d5..229d010617 100644
--- a/uk_bin_collection/uk_bin_collection/councils/DoverDistrictCouncil.py
+++ b/uk_bin_collection/uk_bin_collection/councils/DoverDistrictCouncil.py
@@ -1,49 +1,41 @@
 from bs4 import BeautifulSoup
-from uk_bin_collection.uk_bin_collection.common import *
+from datetime import datetime
+import re
+from uk_bin_collection.uk_bin_collection.common import *  # Consider specific imports
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
 
-
-# import the wonderful Beautiful Soup and the URL grabber
 class CouncilClass(AbstractGetBinDataClass):
-    """
-    Concrete classes have to implement all abstract operations of the
-    base class. They can also override some operations with a default
-    implementation.
-    """
-
     def parse_data(self, page: str, **kwargs) -> dict:
-        # Make a BS4 object
-        soup = BeautifulSoup(page.text, features="html.parser")
-        soup.prettify()
-
-        data = {"bins": []}
-        collections = []
-
-        bins = soup.find("div", {"class": "results-table-wrapper"}).find_all(
-            "div", {"class": "service-wrapper"}
-        )
-        for bin in bins:
-            bin_type = (
-                bin.find("h3", {"class": "service-name"})
-                .get_text()
-                .replace("Collection", "bin")
-                .strip()
-            )
-            bin_date = datetime.strptime(
-                bin.find("td", {"class": "next-service"})
-                .find("span", {"class": "table-label"})
-                .next_sibling.get_text()
-                .strip(),
-                "%d/%m/%Y",
-            )
-            collections.append((bin_type, bin_date))
-
-        ordered_data = sorted(collections, key=lambda x: x[1])
-        for item in ordered_data:
-            dict_data = {
-                "type": item[0].capitalize(),
-                "collectionDate": item[1].strftime(date_format),
-            }
-            data["bins"].append(dict_data)
-
-        return data
+        soup = BeautifulSoup(page.text, 'html.parser')
+
+        bins_data = {"bins": []}
+        bin_collections = []
+
+        results_wrapper = soup.find("div", {"class": "results-table-wrapper"})
+        if not results_wrapper:
+            return bins_data  # Return empty if the results wrapper is not found
+
+        bins = results_wrapper.find_all("div", {"class": "service-wrapper"})
+        for bin_item in bins:
+            service_name = bin_item.find("h3", {"class": "service-name"})
+            next_service = bin_item.find("td", {"class": "next-service"})
+
+            if service_name and next_service:
+                bin_type = service_name.get_text().replace("Collection", "bin").strip()
+                date_span = next_service.find("span", {"class": "table-label"})
+                date_text = date_span.next_sibling.get_text().strip() if date_span else None
+
+                if date_text and re.match(r"\d{2}/\d{2}/\d{4}", date_text):
+                    try:
+                        bin_date = datetime.strptime(date_text, "%d/%m/%Y")
+                        bin_collections.append((bin_type, bin_date))
+                    except ValueError:
+                        continue
+
+        for bin_type, bin_date in sorted(bin_collections, key=lambda x: x[1]):
+            bins_data["bins"].append({
+                "type": bin_type.capitalize(),
+                "collectionDate": bin_date.strftime("%d/%m/%Y"),
+            })
+
+        return bins_data