diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
index 6b1927d4d8..04b57a71ff 100644
--- a/uk_bin_collection/tests/input.json
+++ b/uk_bin_collection/tests/input.json
@@ -1128,7 +1128,8 @@
     "SwaleBoroughCouncil": {
         "postcode": "ME12 2NQ",
         "skip_get_url": true,
-        "uprn": "100061081168",
+        "house_number": "81",
+        "web_driver": "http://selenium:4444",
         "url": "https://swale.gov.uk/bins-littering-and-the-environment/bins/collection-days",
         "wiki_name": "Swale Borough Council"
     },
diff --git a/uk_bin_collection/uk_bin_collection/councils/BarnetCouncil.py b/uk_bin_collection/uk_bin_collection/councils/BarnetCouncil.py
index 2f936407fa..ae31628afe 100644
--- a/uk_bin_collection/uk_bin_collection/councils/BarnetCouncil.py
+++ b/uk_bin_collection/uk_bin_collection/councils/BarnetCouncil.py
@@ -74,13 +74,30 @@ def parse_data(self, page: str, **kwargs) -> dict:

         driver.get(page)

+        wait = WebDriverWait(driver, 10)
+        accept_cookies_button = wait.until(
+            EC.element_to_be_clickable(
+                (
+                    By.XPATH,
+                    "//button[contains(text(), 'Accept additional cookies')]",
+                )
+            )
+        )
+        accept_cookies_button.click()
+
         # Wait for the element to be clickable
-        find_your_collection_button = WebDriverWait(driver, 10).until(
+        wait = WebDriverWait(driver, 10)
+        find_your_collection_button = wait.until(
             EC.element_to_be_clickable(
-                (By.XPATH, '//a[contains(text(), "Find your household collection day")]')
+                (By.LINK_TEXT, "Find your household collection day")
             )
         )

+        # Scroll to the element (in case something is blocking it)
+        driver.execute_script(
+            "arguments[0].scrollIntoView();", find_your_collection_button
+        )
+
+        # Click the element
         find_your_collection_button.click()
@@ -107,12 +124,12 @@ def parse_data(self, page: str, **kwargs) -> dict:

         postcode_input.send_keys(user_postcode)

-        find_address_button = WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located(
-                (By.CSS_SELECTOR, '[value="Find address"]')
-            )
+        find_address_button = WebDriverWait(driver, 30).until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, '[value="Find address"]'))
         )
-        find_address_button.click()
+        driver.execute_script("arguments[0].scrollIntoView();", find_address_button)
+        driver.execute_script("arguments[0].click();", find_address_button)
+        # find_address_button.click()

         time.sleep(15)
         # Wait for address box to be visible
diff --git a/uk_bin_collection/uk_bin_collection/councils/HaltonBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/HaltonBoroughCouncil.py
index f76e02ee7f..946611b67b 100644
--- a/uk_bin_collection/uk_bin_collection/councils/HaltonBoroughCouncil.py
+++ b/uk_bin_collection/uk_bin_collection/councils/HaltonBoroughCouncil.py
@@ -80,6 +80,10 @@ def parse_data(self, page: str, **kwargs) -> dict:
             )
         )
         search_btn.send_keys(Keys.ENTER)
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, "collectionTabs"))
+        )
+
         soup = BeautifulSoup(driver.page_source, features="html.parser")

         # Find all tab panels within the collectionTabs
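Note for reviewers: the Halton change above swaps a parse-immediately race (reading driver.page_source straight after submitting the search) for an explicit wait on the collectionTabs container. A minimal sketch of that wait-then-parse pattern, with the Selenium endpoint and page URL as placeholders, not values from this repo:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Endpoint and URL are hypothetical, for illustration only.
driver = webdriver.Remote("http://selenium:4444", options=webdriver.ChromeOptions())
try:
    driver.get("https://example.org/bin-day-lookup")
    # Block until the results container is in the DOM, so page_source
    # is only snapshotted after the dynamic content has rendered.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "collectionTabs"))
    )
    soup = BeautifulSoup(driver.page_source, features="html.parser")
finally:
    driver.quit()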
diff --git a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py
index 215bed9b18..3d607961cb 100644
--- a/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py
+++ b/uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py
@@ -1,9 +1,11 @@
-import requests
 from bs4 import BeautifulSoup
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+
 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
-

 # import the wonderful Beautiful Soup and the URL grabber
@@ -17,36 +19,72 @@ class CouncilClass(AbstractGetBinDataClass):

     def parse_data(self, page: str, **kwargs) -> dict:
         # Get postcode and UPRN from kwargs
         user_postcode = kwargs.get("postcode")
-        user_uprn = kwargs.get("uprn")
+        user_paon = kwargs.get("paon")
+        web_driver = kwargs.get("web_driver")
+        headless = kwargs.get("headless")
         check_postcode(user_postcode)
-        check_uprn(user_uprn)
+        check_paon(user_paon)

         # Build URL to parse
-        council_url = f"https://swale.gov.uk/bins-littering-and-the-environment/bins/collection-days?postcode={user_postcode.replace(' ', '+')}&addresses={user_uprn}&address-submit="
+        council_url = "https://swale.gov.uk/bins-littering-and-the-environment/bins/my-collection-day"
+
+        # Create Selenium webdriver
+        driver = create_webdriver(web_driver, headless, None, __name__)
+        driver.get(council_url)
+
+        # Wait for the postcode field to appear then populate it
+        try:
+            inputElement_postcode = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.ID, "q462406_q1"))
+            )
+            inputElement_postcode.send_keys(user_postcode)
+        except Exception:
+            print("Page failed to load. Probably due to Cloudflare robot check!")
+
+        # Click search button
+        findAddress = WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, "form_email_462397_submit"))
+        )
+        driver.execute_script("arguments[0].click();", findAddress)
+
+        # Wait for the 'Select address' dropdown to appear and select
+        # the option matching the house name/number
+        WebDriverWait(driver, 10).until(
+            EC.element_to_be_clickable(
+                (
+                    By.XPATH,
+                    "//select[@id='SBCYBDAddressList']//option[contains(., '"
+                    + user_paon
+                    + "')]",
+                )
+            )
+        ).click()
+
+        # Click search button
+        getBins = WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, "form_email_462397_submit"))
+        )
+        driver.execute_script("arguments[0].click();", getBins)
+
+        BinTable = WebDriverWait(driver, 30).until(
+            EC.presence_of_element_located((By.ID, "SBC-YBD-Main"))
+        )

-        # Parse URL and read if connection successful
-        requests.packages.urllib3.disable_warnings()
-        response = requests.get(council_url, verify=False)
-        if response.status_code == 200:
-            soup = BeautifulSoup(response.text, features="html.parser")
-            soup.prettify()
-        else:
-            raise ConnectionAbortedError("Could not parse council website.")
+        soup = BeautifulSoup(driver.page_source, features="html.parser")
+        soup.prettify()

         data = {"bins": []}

         # Get the collection bullet points on the page and parse them
-        form_area = soup.find("form", {"class": "integration bin-lookup"})
-        collections = [
-            item.text.strip().split(",") for item in form_area.find_all("li")
-        ]
-        for c in collections:
-            bin_type = c[0].strip()
-            # temp_date = c[2].strip() + " " + str(datetime.now().year)
-            bin_date = datetime.strptime(
-                c[2].strip() + " " + str(datetime.now().year), "%d %B %Y"
-            ).strftime(date_format)
-            dict_data = {"type": bin_type, "collectionDate": bin_date}
-            data["bins"].append(dict_data)
+        nextCollections = soup.find("div", {"id": "nextCollections"})
+        for c in nextCollections:
+            collection = c.find_all("strong")
+            for bin in collection:
+                split = (bin.text).split(" on ")
+                bin_type = split[0]
+                bin_date = datetime.strptime(split[1], "%A %d %b %Y").strftime(
+                    "%d/%m/%Y"
+                )
+                dict_data = {"type": bin_type, "collectionDate": bin_date}
+                data["bins"].append(dict_data)

         return data
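Note for reviewers: the new Swale parser assumes each collection is rendered inside a <strong> tag as "<bin type> on <weekday day month year>". A self-contained sketch of that string handling, with the sample text invented for illustration:

from datetime import datetime

def parse_collection(text: str) -> dict:
    # "Green recycling bin on Monday 15 Jan 2024" -> type half, date half
    split = text.split(" on ")
    bin_type = split[0]
    bin_date = datetime.strptime(split[1], "%A %d %b %Y")
    return {"type": bin_type, "collectionDate": bin_date.strftime("%d/%m/%Y")}

print(parse_collection("Green recycling bin on Monday 15 Jan 2024"))
# -> {'type': 'Green recycling bin', 'collectionDate': '15/01/2024'}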
data["bins"].append(dict_data) return data diff --git a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py index e52612bfe1..a6d8914034 100644 --- a/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py +++ b/uk_bin_collection/uk_bin_collection/councils/WestBerkshireCouncil.py @@ -74,30 +74,51 @@ def parse_data(self, page: str, **kwargs) -> dict: soup = BeautifulSoup(driver.page_source, features="html.parser") soup.prettify() - rubbish_date = datetime.strptime( - " ".join( - soup.find("div", {"id": "FINDYOURBINDAYS_RUBBISHDATE_OUTERDIV"}) - .get_text(strip=True) - .split()[6:8] - ), - "%d %B", - ).replace(year=datetime.now().year) - recycling_date = datetime.strptime( - " ".join( - soup.find("div", {"id": "FINDYOURBINDAYS_RECYCLINGDATE_OUTERDIV"}) - .get_text(strip=True) - .split()[6:8] - ), - "%d %B", - ).replace(year=datetime.now().year) - food_date = datetime.strptime( - " ".join( - soup.find("div", {"id": "FINDYOURBINDAYS_FOODWASTEDATE_OUTERDIV"}) - .get_text(strip=True) - .split()[8:10] - ), - "%d %B", - ).replace(year=datetime.now().year) + rubbish_div = soup.find( + "div", {"id": "FINDYOURBINDAYS_RUBBISHDATE_OUTERDIV"} + ) + try: + rubbish_date = rubbish_div.find_all("div")[2] + rubbish_date = datetime.strptime( + rubbish_date.text, + "%A %d %B", + ).replace(year=datetime.now().year) + except: + rubbish_date = rubbish_div.find_all("div")[3] + rubbish_date = datetime.strptime( + rubbish_date.text, + "%A %d %B", + ).replace(year=datetime.now().year) + recycling_div = soup.find( + "div", {"id": "FINDYOURBINDAYS_RECYCLINGDATE_OUTERDIV"} + ) + try: + recycling_date = recycling_div.find_all("div")[2] + recycling_date = datetime.strptime( + recycling_date.text, + "%A %d %B", + ).replace(year=datetime.now().year) + except: + rubbish_date = recycling_div.find_all("div")[3] + rubbish_date = datetime.strptime( + rubbish_date.text, + "%A %d %B", + ).replace(year=datetime.now().year) + food_div = soup.find( + "div", {"id": "FINDYOURBINDAYS_RECYCLINGDATE_OUTERDIV"} + ) + try: + food_date = food_div.find_all("div")[2] + food_date = datetime.strptime( + food_date.text, + "%A %d %B", + ).replace(year=datetime.now().year) + except: + food_date = food_div.find_all("div")[3] + food_date = datetime.strptime( + food_date.text, + "%A %d %B", + ).replace(year=datetime.now().year) if datetime.now().month == 12 and rubbish_date.month == 1: rubbish_date = rubbish_date + relativedelta(years=1)