Merge pull request #886 from m26dvd/master
robbrad authored Oct 15, 2024
2 parents 4e95ade + 50f3aab commit cdd2e97
Showing 5 changed files with 138 additions and 57 deletions.
3 changes: 2 additions & 1 deletion uk_bin_collection/tests/input.json
@@ -1128,7 +1128,8 @@
     "SwaleBoroughCouncil": {
         "postcode": "ME12 2NQ",
         "skip_get_url": true,
-        "uprn": "100061081168",
+        "house_number": "81",
+        "web_driver": "http://selenium:4444",
         "url": "https://swale.gov.uk/bins-littering-and-the-environment/bins/collection-days",
         "wiki_name": "Swale Borough Council"
     },
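
For reference, the new fields line up with the keyword arguments the reworked Swale parser reads (see SwaleBoroughCouncil.py below). A minimal sketch of a direct call, assuming the test harness maps the JSON "house_number" field onto the parser's "paon" keyword:

# Sketch only, not part of this PR. Assumes "house_number" in input.json
# is forwarded to parse_data() as the "paon" keyword argument.
from uk_bin_collection.uk_bin_collection.councils.SwaleBoroughCouncil import (
    CouncilClass,
)

parser = CouncilClass()
data = parser.parse_data(
    "https://swale.gov.uk/bins-littering-and-the-environment/bins/collection-days",
    postcode="ME12 2NQ",                # validated by check_postcode()
    paon="81",                          # house number, validated by check_paon()
    web_driver="http://selenium:4444",  # remote Selenium endpoint
    headless=True,
)
print(data)  # {"bins": [{"type": ..., "collectionDate": ...}, ...]}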
31 changes: 24 additions & 7 deletions uk_bin_collection/uk_bin_collection/councils/BarnetCouncil.py
@@ -74,13 +74,30 @@ def parse_data(self, page: str, **kwargs) -> dict:

         driver.get(page)

+        # Dismiss the cookie banner before anything else is clicked
+        wait = WebDriverWait(driver, 10)
+        accept_cookies_button = wait.until(
+            EC.element_to_be_clickable(
+                (
+                    By.XPATH,
+                    "//button[contains(text(), 'Accept additional cookies')]",
+                )
+            )
+        )
+        accept_cookies_button.click()
+
         # Wait for the element to be clickable
-        find_your_collection_button = WebDriverWait(driver, 10).until(
+        wait = WebDriverWait(driver, 10)
+        find_your_collection_button = wait.until(
             EC.element_to_be_clickable(
-                (By.XPATH, '//a[contains(text(), "Find your household collection day")]')
+                (By.LINK_TEXT, "Find your household collection day")
             )
         )

+        # Scroll to the element (in case something is blocking it)
+        driver.execute_script(
+            "arguments[0].scrollIntoView();", find_your_collection_button
+        )
+
         # Click the element
         find_your_collection_button.click()
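
The wait-then-scroll-then-click sequence above now appears more than once in this file. A small helper along these lines (a sketch, not part of the PR) would keep the behaviour consistent:

# Sketch only. Wraps the pattern used above: wait until clickable,
# scroll into view, then click.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


def wait_scroll_click(driver, locator, timeout=10):
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable(locator)
    )
    driver.execute_script("arguments[0].scrollIntoView();", element)
    element.click()
    return element

# e.g. wait_scroll_click(driver, (By.LINK_TEXT, "Find your household collection day"))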

@@ -107,12 +124,12 @@ def parse_data(self, page: str, **kwargs) -> dict:

         postcode_input.send_keys(user_postcode)

-        find_address_button = WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located(
-                (By.CSS_SELECTOR, '[value="Find address"]')
-            )
+        find_address_button = WebDriverWait(driver, 30).until(
+            EC.element_to_be_clickable((By.CSS_SELECTOR, '[value="Find address"]'))
         )
-        find_address_button.click()
+        driver.execute_script("arguments[0].scrollIntoView();", find_address_button)
+        driver.execute_script("arguments[0].click();", find_address_button)
+        # find_address_button.click()

         time.sleep(15)
         # Wait for address box to be visible
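
A possible follow-up: the fixed time.sleep(15) still costs 15 seconds on every run. An explicit wait would return as soon as the address box renders; the locator below is a placeholder, since the element's actual ID is not shown in this hunk:

# Sketch only; By.ID "addressBox" is a hypothetical locator.
address_box = WebDriverWait(driver, 15).until(
    EC.visibility_of_element_located((By.ID, "addressBox"))
)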
@@ -80,6 +80,10 @@ def parse_data(self, page: str, **kwargs) -> dict:
             )
         )
         search_btn.send_keys(Keys.ENTER)
+        # Wait for the results to render before parsing the page
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, "collectionTabs"))
+        )

         soup = BeautifulSoup(driver.page_source, features="html.parser")

         # Find all tab panels within the collectionTabs
88 changes: 63 additions & 25 deletions uk_bin_collection/uk_bin_collection/councils/SwaleBoroughCouncil.py
@@ -1,9 +1,11 @@
-import requests
 from bs4 import BeautifulSoup
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait

 from uk_bin_collection.uk_bin_collection.common import *
 from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass


 # import the wonderful Beautiful Soup and the URL grabber
@@ -17,36 +19,72 @@ class CouncilClass(AbstractGetBinDataClass):

     def parse_data(self, page: str, **kwargs) -> dict:
         # Get postcode and house name/number (PAON) from kwargs
         user_postcode = kwargs.get("postcode")
-        user_uprn = kwargs.get("uprn")
+        user_paon = kwargs.get("paon")
+        web_driver = kwargs.get("web_driver")
+        headless = kwargs.get("headless")
         check_postcode(user_postcode)
-        check_uprn(user_uprn)
+        check_paon(user_paon)

         # Build URL to parse
-        council_url = f"https://swale.gov.uk/bins-littering-and-the-environment/bins/collection-days?postcode={user_postcode.replace(' ', '+')}&addresses={user_uprn}&address-submit="
+        council_url = "https://swale.gov.uk/bins-littering-and-the-environment/bins/my-collection-day"

+        # Create Selenium webdriver
+        driver = create_webdriver(web_driver, headless, None, __name__)
+        driver.get(council_url)
+
+        # Wait for the postcode field to appear, then populate it
+        try:
+            inputElement_postcode = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.ID, "q462406_q1"))
+            )
+            inputElement_postcode.send_keys(user_postcode)
+        except Exception:
+            print("Page failed to load. Probably due to Cloudflare robot check!")
+
+        # Click the search button to look up the address
+        findAddress = WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, "form_email_462397_submit"))
+        )
+        driver.execute_script("arguments[0].click();", findAddress)
+
+        # Wait for the 'Select address' dropdown to appear and select the
+        # option matching the house name/number
+        WebDriverWait(driver, 10).until(
+            EC.element_to_be_clickable(
+                (
+                    By.XPATH,
+                    "//select[@id='SBCYBDAddressList']//option[contains(., '"
+                    + user_paon
+                    + "')]",
+                )
+            )
+        ).click()
+
+        # Submit again to retrieve the bin collection days
+        getBins = WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.ID, "form_email_462397_submit"))
+        )
+        driver.execute_script("arguments[0].click();", getBins)
+
+        # Wait for the results container before reading the page source
+        BinTable = WebDriverWait(driver, 30).until(
+            EC.presence_of_element_located((By.ID, "SBC-YBD-Main"))
+        )

-        # Parse URL and read if connection successful
-        requests.packages.urllib3.disable_warnings()
-        response = requests.get(council_url, verify=False)
-        if response.status_code == 200:
-            soup = BeautifulSoup(response.text, features="html.parser")
-            soup.prettify()
-        else:
-            raise ConnectionAbortedError("Could not parse council website.")
+        soup = BeautifulSoup(driver.page_source, features="html.parser")
+        soup.prettify()

         data = {"bins": []}

-        # Get the collection bullet points on the page and parse them
-        form_area = soup.find("form", {"class": "integration bin-lookup"})
-        collections = [
-            item.text.strip().split(",") for item in form_area.find_all("li")
-        ]
-        for c in collections:
-            bin_type = c[0].strip()
-            bin_date = datetime.strptime(
-                c[2].strip() + " " + str(datetime.now().year), "%d %B %Y"
-            ).strftime(date_format)
-            dict_data = {"type": bin_type, "collectionDate": bin_date}
-            data["bins"].append(dict_data)
+        # Each entry reads "<bin type> on <weekday day month year>"
+        nextCollections = soup.find("div", {"id": "nextCollections"})
+        for c in nextCollections:
+            collection = c.find_all("strong")
+            for bin in collection:
+                split = (bin.text).split(" on ")
+                bin_type = split[0]
+                bin_date = datetime.strptime(split[1], "%A %d %b %Y").strftime(
+                    "%d/%m/%Y"
+                )
+                dict_data = {"type": bin_type, "collectionDate": bin_date}
+                data["bins"].append(dict_data)

         return data
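
The split on " on " assumes entries shaped like "Green bin on Monday 21 Oct 2024" (the sample text is illustrative; only the "%A %d %b %Y" format comes from the code above):

from datetime import datetime

# Illustrative input; the real page text is not shown in this diff.
text = "Green bin on Monday 21 Oct 2024"
bin_type, date_part = text.split(" on ")
bin_date = datetime.strptime(date_part, "%A %d %b %Y").strftime("%d/%m/%Y")
print(bin_type, bin_date)  # Green bin 21/10/2024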
@@ -74,30 +74,51 @@ def parse_data(self, page: str, **kwargs) -> dict:

         soup = BeautifulSoup(driver.page_source, features="html.parser")
         soup.prettify()

-        rubbish_date = datetime.strptime(
-            " ".join(
-                soup.find("div", {"id": "FINDYOURBINDAYS_RUBBISHDATE_OUTERDIV"})
-                .get_text(strip=True)
-                .split()[6:8]
-            ),
-            "%d %B",
-        ).replace(year=datetime.now().year)
-        recycling_date = datetime.strptime(
-            " ".join(
-                soup.find("div", {"id": "FINDYOURBINDAYS_RECYCLINGDATE_OUTERDIV"})
-                .get_text(strip=True)
-                .split()[6:8]
-            ),
-            "%d %B",
-        ).replace(year=datetime.now().year)
-        food_date = datetime.strptime(
-            " ".join(
-                soup.find("div", {"id": "FINDYOURBINDAYS_FOODWASTEDATE_OUTERDIV"})
-                .get_text(strip=True)
-                .split()[8:10]
-            ),
-            "%d %B",
-        ).replace(year=datetime.now().year)
+        # Each panel carries its date in the third child div; some layouts
+        # shift it to the fourth, hence the fallback in each block below.
+        rubbish_div = soup.find(
+            "div", {"id": "FINDYOURBINDAYS_RUBBISHDATE_OUTERDIV"}
+        )
+        try:
+            rubbish_date = rubbish_div.find_all("div")[2]
+            rubbish_date = datetime.strptime(
+                rubbish_date.text,
+                "%A %d %B",
+            ).replace(year=datetime.now().year)
+        except Exception:
+            rubbish_date = rubbish_div.find_all("div")[3]
+            rubbish_date = datetime.strptime(
+                rubbish_date.text,
+                "%A %d %B",
+            ).replace(year=datetime.now().year)
+        recycling_div = soup.find(
+            "div", {"id": "FINDYOURBINDAYS_RECYCLINGDATE_OUTERDIV"}
+        )
+        try:
+            recycling_date = recycling_div.find_all("div")[2]
+            recycling_date = datetime.strptime(
+                recycling_date.text,
+                "%A %d %B",
+            ).replace(year=datetime.now().year)
+        except Exception:
+            recycling_date = recycling_div.find_all("div")[3]
+            recycling_date = datetime.strptime(
+                recycling_date.text,
+                "%A %d %B",
+            ).replace(year=datetime.now().year)
+        food_div = soup.find(
+            "div", {"id": "FINDYOURBINDAYS_FOODWASTEDATE_OUTERDIV"}
+        )
+        try:
+            food_date = food_div.find_all("div")[2]
+            food_date = datetime.strptime(
+                food_date.text,
+                "%A %d %B",
+            ).replace(year=datetime.now().year)
+        except Exception:
+            food_date = food_div.find_all("div")[3]
+            food_date = datetime.strptime(
+                food_date.text,
+                "%A %d %B",
+            ).replace(year=datetime.now().year)

         if datetime.now().month == 12 and rubbish_date.month == 1:
             rubbish_date = rubbish_date + relativedelta(years=1)
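
The three near-identical try/except blocks above could collapse into one helper. A sketch, assuming all three panels carry the same "%A %d %B" text in their third or fourth child div:

# Sketch only, not part of this PR.
from datetime import datetime


def extract_bin_date(outer_div):
    # Try the third child div first, then fall back to the fourth.
    for index in (2, 3):
        try:
            text = outer_div.find_all("div")[index].text
            return datetime.strptime(text, "%A %d %B").replace(
                year=datetime.now().year
            )
        except (IndexError, ValueError):
            continue
    raise ValueError("no parsable collection date in panel")

# e.g. rubbish_date = extract_bin_date(rubbish_div)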

