diff --git a/.github/workflows/behave_pull_request.yml b/.github/workflows/behave_pull_request.yml index cfb7d19ea4..c651248485 100644 --- a/.github/workflows/behave_pull_request.yml +++ b/.github/workflows/behave_pull_request.yml @@ -35,7 +35,6 @@ jobs: with: files: | uk_bin_collection/uk_bin_collection/councils/**.py - - name: Set Council Tests Environment Variable id: set-council-tests run: | @@ -50,7 +49,6 @@ jobs: fi done echo "council_tests=$COUNCIL_TESTS" >> $GITHUB_OUTPUT - outputs: council_tests: ${{ steps.set-council-tests.outputs.council_tests }} @@ -111,7 +109,6 @@ jobs: repo=${{ github.event.pull_request.head.repo.full_name || 'robbrad/UKBinCollectionData' }} branch=${{ github.event.pull_request.head.ref || 'master' }} make parity-check repo=$repo branch=$branch - integration-tests: name: Run Integration Tests needs: setup diff --git a/README.md b/README.md index 7072d77612..636ebcd139 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ If you miss this on the first setup you can reconfigure it. "color": "blue" } } - +``` --- ## Standalone Usage diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json index d949cd4ec9..c5488f86f6 100755 --- a/uk_bin_collection/tests/input.json +++ b/uk_bin_collection/tests/input.json @@ -1993,8 +1993,8 @@ "wiki_note": "You will need to use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find the UPRN." 
}, "WestLindseyDistrictCouncil": { - "house_number": "PRIVATE ACCOMMODATION", - "postcode": "LN8 2AR", + "house_number": "35", + "postcode": "LN8 3AX", "skip_get_url": true, "url": "https://www.west-lindsey.gov.uk/", "wiki_name": "West Lindsey District Council", diff --git a/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py index 93f03c3df1..e116b7d930 100644 --- a/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py @@ -21,7 +21,6 @@ class CouncilClass(AbstractGetBinDataClass): """ def parse_data(self, page: str, **kwargs) -> dict: - # Make a BS4 object driver = None try: bin_data_dict = {"bins": []} @@ -76,12 +75,13 @@ def parse_data(self, page: str, **kwargs) -> dict: # Get the current year current_year = datetime.now().year + # Append the year to the date + date_with_year = date_object.replace(year=current_year) + # Check if the parsed date is in the past compared to the current date if date_object < datetime.now(): # If the parsed date is in the past, assume it's for the next year current_year += 1 - # Append the year to the date - date_with_year = date_object.replace(year=current_year) # Format the date with the year date_with_year_formatted = date_with_year.strftime( diff --git a/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py b/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py index fd0a299e64..238c978f6c 100644 --- a/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/KirkleesCouncil.py @@ -1,12 +1,17 @@ +import time from datetime import datetime from typing import Optional +from bs4 import BeautifulSoup from selenium.common import TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.remote.webdriver 
import WebDriver from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait +from webdriver_manager.drivers.chrome import ChromeDriver + +from selenium import webdriver from uk_bin_collection.uk_bin_collection.common import create_webdriver from uk_bin_collection.uk_bin_collection.common import date_format @@ -55,78 +60,84 @@ def _parse_data(self, page: str, **kwargs) -> dict: - Extract info from the 'alt' attribute of the images on that page """ - bins = [] + data = {"bins": []} + collections = [] user_paon = kwargs["paon"] user_postcode = kwargs["postcode"] - self._driver = driver = create_webdriver( - web_driver=kwargs["web_driver"], - headless=kwargs.get("headless", True), - session_name=__name__, - ) + self._driver = driver = webdriver.Chrome() + # self._driver = driver = create_webdriver( + # web_driver=kwargs["web_driver"], + # headless=kwargs.get("headless", True), + # session_name=__name__, + # ) driver.implicitly_wait(1) driver.get( - "https://www.kirklees.gov.uk/beta/your-property-bins-recycling/your-bins/default.aspx" + "https://my.kirklees.gov.uk/service/Bins_and_recycling___Manage_your_bins" ) - wait_for_element( - driver, By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises" - ) + time.sleep(5) + + # Switch to iframe + iframe = driver.find_element(By.CSS_SELECTOR, "#fillform-frame-1") + driver.switch_to.frame(iframe) - house_input = driver.find_element( - By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoPremises" + wait_for_element( + driver, By.ID, "mandatory_Postcode", timeout=10 ) - house_input.send_keys(user_paon) postcode_input = driver.find_element( - By.ID, "cphPageBody_cphContent_thisGeoSearch_txtGeoSearch" + By.ID, "Postcode" ) postcode_input.send_keys(user_postcode) - # submit address search - driver.find_element(By.ID, "butGeoSearch").send_keys(Keys.RETURN) + wait_for_element(driver, By.ID, "List") + time.sleep(2) + + WebDriverWait(driver, 10).until( + 
EC.element_to_be_clickable( + ( + By.XPATH, + "//select[@name='List']//option[contains(., '" + + user_paon + + "')]", + ) + ) + ).click() - wait_for_element( - driver, - By.ID, - "cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor", - # submitting can be slow - timeout=30, - ) + time.sleep(10) - # Open the panel - driver.find_element( - By.ID, "cphPageBody_cphContent_wtcDomestic240__lnkAccordionAnchor" - ).click() + # For whatever reason, the page sometimes automatically goes to the next step + next_button = driver.find_element(By.XPATH, '/html/body/div/div/section/form/div/nav/div[2]/button') + if next_button.is_displayed(): + next_button.click() - # Domestic waste calendar - wait_for_element( - driver, By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar" - ) - calendar_link = driver.find_element( - By.ID, "cphPageBody_cphContent_wtcDomestic240__LnkCalendar" - ) - driver.execute_script("arguments[0].click();", calendar_link) - # Recycling                      collection date 14 March 2024 dict: alternateCheck = False strong = soup.find_all("strong") + collections = [] if alternateCheck: bin_types = strong[2].text.strip().replace(".", "").split(" and ") for bin in bin_types: - dict_data = { - "type": bin, - "collectionDate": strong[1].text.strip(), - } - bindata["bins"].append(dict_data) + collections.append((bin.capitalize(), datetime.strptime(strong[1].text.strip(), date_format))) + else: p_tag = soup.find_all("p") i = 1 @@ -65,11 +63,18 @@ def parse_data(self, page: str, **kwargs) -> dict: p.text.split("Your ")[1].split(" is collected")[0].split(" and ") ) for bin in bin_types: - dict_data = { - "type": bin, - "collectionDate": strong[i].text.strip(), - } - bindata["bins"].append(dict_data) + collections.append((bin.capitalize(), datetime.strptime(strong[i].text.strip(), date_format))) i += 2 + if len(strong) > 3: + collections.append(("Garden", datetime.strptime(strong[4].text.strip(), date_format))) + + ordered_data = sorted(collections, 
key=lambda x: x[1]) + for item in ordered_data: + dict_data = { + "type": item[0] + " bin", + "collectionDate": item[1].strftime(date_format), + } + bindata["bins"].append(dict_data) + return bindata diff --git a/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py index f0e91b37aa..52ef26aa5a 100644 --- a/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/RugbyBoroughCouncil.py @@ -13,11 +13,6 @@ class CouncilClass(AbstractGetBinDataClass): def parse_data(self, page: str, **kwargs) -> dict: data = {"bins": []} - bin_types = { - "240L RUBBISH BIN": "Black bin", - "240L GARDEN BIN": "Green bin", - "180L RECYCLING BIN": "Blue lid bin", - } collections = [] user_postcode = kwargs.get("postcode") @@ -73,7 +68,8 @@ def parse_data(self, page: str, **kwargs) -> dict: for row in table_rows: row_text = row.text.strip().split("\n") - bin_type = bin_types.get(row_text[0]) + bin_text = row_text[0].split(" ") + bin_type = ' '.join(bin_text[1:]).capitalize() collections.append( (bin_type, datetime.strptime(row_text[1], "%A %d %b %Y")) ) diff --git a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py index 28202f9fa9..db9b2b33a9 100644 --- a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py @@ -6,8 +6,18 @@ from uk_bin_collection.uk_bin_collection.common import * from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass -# import the wonderful Beautiful Soup and the URL grabber +def parse_collection_date(date_string) -> datetime: + now = datetime.now() + if date_string == "is due today": + return now + + parsed_date = datetime.strptime(date_string, "%A, %d %B").replace(year=now.year) + + if now.month == 12 and parsed_date.month 
< 12: + parsed_date = parsed_date.replace(year=(now.year + 1)) + + return parsed_date class CouncilClass(AbstractGetBinDataClass): """ diff --git a/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py index ca305cb68d..4361c7ed62 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WalsallCouncil.py @@ -28,21 +28,25 @@ def parse_data(self, page: str, **kwargs) -> dict: response = requests.get(URI, headers=headers) soup = BeautifulSoup(response.text, "html.parser") - # Extract links to collection shedule pages and iterate through the pages - schedule_links = soup.findAll("a", {"class": "nav-link"}, href=True) + # Extract links to collection schedule pages and iterate through the pages + schedule_links = soup.findAll("td") + for item in schedule_links: - if "roundname" in item["href"]: + if "roundname" in item.contents[1]["href"]: # get bin colour - bincolour = item["href"].split("=")[-1].split("%")[0].upper() - binURL = "https://cag.walsall.gov.uk" + item["href"] - r = requests.get(binURL, headers=headers) + bin_colour = item.contents[1]["href"].split("=")[-1].split("%")[0].upper() + bin_url = "https://cag.walsall.gov.uk" + item.contents[1]["href"] + r = requests.get(bin_url, headers=headers) + if r.status_code != 200: + print(f"Collection details for {bin_colour.lower()} bin could not be retrieved.") + break soup = BeautifulSoup(r.text, "html.parser") table = soup.findAll("tr") for tr in table: td = tr.findAll("td") if td: dict_data = { - "type": bincolour, + "type": bin_colour.capitalize() + " bin", "collectionDate": datetime.strptime( td[1].text.strip(), "%d/%m/%Y" ).strftime("%d/%m/%Y"), @@ -50,7 +54,7 @@ def parse_data(self, page: str, **kwargs) -> dict: bindata["bins"].append(dict_data) bindata["bins"].sort( - key=lambda x: datetime.strptime(x.get("collectionDate"), "%d/%m/%Y") + key=lambda x: 
datetime.strptime(x.get("collectionDate"), date_format) ) return bindata diff --git a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py index 8693a13592..36908175f3 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py @@ -99,7 +99,7 @@ def parse_data(self, page: str, **kwargs) -> dict: ).replace(year=datetime.now().year) food_div = soup.find( - "div", {"id": "FINDYOURBINDAYS_RECYCLINGDATE_OUTERDIV"} + "div", {"id": "FINDYOURBINDAYS_FOODWASTEDATE_OUTERDIV"} ) food_date = food_div.find_all("div")[2] if food_date.text == "Today": diff --git a/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py index b4e9fd0408..3c1bb12f2e 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WestLindseyDistrictCouncil.py @@ -32,7 +32,7 @@ def parse_data(self, page: str, **kwargs) -> dict: # Strip data and parse the JSON address_data = json.loads( - re.sub("getAddressesCallback\d+\(", "", address_data)[:-2] + re.sub(r"getAddressesCallback\d+\(", "", address_data)[:-2] ) if address_data["TotalHits"] == 0: @@ -48,9 +48,7 @@ def parse_data(self, page: str, **kwargs) -> dict: address_x = address_data["Locations"][0]["X"] address_y = address_data["Locations"][0]["Y"] - stage2_url = "https://wlnk.statmap.co.uk/map/Cluster.svc/getpage?script=\Cluster\Cluster.AuroraScript$&taskId=bins&format=js&updateOnly=true&query=x%3D{}%3By%3D{}%3Bid%3D{}".format( - address_x, address_y, address_id - ) + stage2_url = fr"https://wlnk.statmap.co.uk/map/Cluster.svc/getpage?script=\Cluster\Cluster.AuroraScript$&taskId=bins&format=js&updateOnly=true&query=x%3D{address_x}%3By%3D{address_y}%3Bid%3D{address_id}" bin_query = 
requests.get(stage2_url).text @@ -61,7 +59,7 @@ def parse_data(self, page: str, **kwargs) -> dict: ) # Return only the HTML contained within the Javascript function payload. - pattern = 'document\.getElementById\("DR1"\)\.innerHTML="(.+)";' + pattern = r'document\.getElementById\("DR1"\)\.innerHTML="(.+)";' bin_html = re.findall(pattern, bin_query) @@ -86,19 +84,20 @@ def parse_data(self, page: str, **kwargs) -> dict: # Get bin date bin_date_text = row.text - pattern = "\d+\/\d+" + pattern = r"\d+\/\d+" bin_dates = re.findall(pattern, bin_date_text) for bin_date in bin_dates: # Split the bin date into day and month and build a full date with the current year split_date = bin_date.split("/") + if len(split_date[0]) < 1: + raise ValueError("Error parsing dates retrieved from website") full_date = datetime( datetime.now().year, int(split_date[1]), int(split_date[0]) ) - # If the current month is December and one of the next collections is in January, increment the year - if datetime.now().month == 12 and int(split_date[1]) == 1: - full_date = bin_date + relativedelta(years=1) + if datetime.now().month == 12 and int(split_date[1]) < 12: + full_date = datetime(year=datetime.now().year + 1, month=int(split_date[1]), day=int(split_date[0])) # Since data in unordered, add to a tuple collections.append((bin_type.title(), full_date)) @@ -116,3 +115,4 @@ def parse_data(self, page: str, **kwargs) -> dict: data["bins"].append(dict_data) return data +