Merge pull request #524 from sammort/521-vale-of-white-horse

Add Vale of White Horse Council
robbrad · Jan 3, 2024 · 3b33ab7 · 3b33ab7
2 parents 2c08f5b + 433a71b
commit 3b33ab7
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 9 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -156,6 +156,7 @@ Useful functions include:
 - a function to check [if a date is a holiday](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L117) in a given part of the UK
 - a function that returns the [dates of a given weekday](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L136) in N amounts of weeks
 - a function that returns a [list of dates every N days](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L148) from a given start date
+- a function to check [if a string contains a date](./uk_bin_collection/uk_bin_collection/common.py#L249) (leverages [dateutil's parser](https://dateutil.readthedocs.io/en/stable/parser.html))
 
 `common.py` also contains a [standardised date format](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L11) variable called `date_format`, which is useful to call when formatting datetimes.
 

diff --git a/uk_bin_collection/tests/features/validate_council_outputs.feature b/uk_bin_collection/tests/features/validate_council_outputs.feature
@@ -118,6 +118,7 @@ Feature: Test each council output matches expected results
             | TorbayCouncil | None | None |
             | TorridgeDistrictCouncil | None | None |
             | ValeofGlamorganCouncil | None | None |
+            | ValeofWhiteHorseCouncil | None | None |
             | WakefieldCityCouncil | http://selenium:4444 | local |
             | WarwickDistrictCouncil | None | None |
             | WaverleyBoroughCouncil | None | None |

diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
@@ -776,6 +776,11 @@
         "url": "https://www.valeofglamorgan.gov.uk/en/living/Recycling-and-Waste/",
         "wiki_name": "Vale of Glamorgan Council"
     },
+    "ValeofWhiteHorseCouncil": {
+        "uprn": "100121391443",
+        "url": "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb",
+        "wiki_name": "Vale of White Horse Council"
+    },
     "WakefieldCityCouncil": {
         "custom_component_show_url_field": true,
         "skip_get_url": true,

diff --git a/uk_bin_collection/uk_bin_collection/common.py b/uk_bin_collection/uk_bin_collection/common.py
@@ -6,6 +6,7 @@
 import re
 import requests
 from datetime import datetime
+from dateutil.parser import parse
 from enum import Enum
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service as ChromeService
@@ -245,6 +246,20 @@ def validate_dates(bin_dates: dict) -> dict:
     raise NotImplementedError()
     # If a date is in December and the next is in January, increase the year
 
+def contains_date(string, fuzzy=False) -> bool:
+    """
+    Return whether the string can be interpreted as a date.
+
+    Parsing is delegated to dateutil's ``parse``; the parsed value is
+    discarded and only success or failure is reported.
+
+    :param string: str, string to check for date
+    :param fuzzy: bool, ignore unknown tokens in string if True
+    :return: True if the string parsed as a date, False otherwise
+    """
+    try:
+        parse(string, fuzzy=fuzzy)
+        return True
+
+    except (ValueError, OverflowError):
+        # ValueError: the text is not a recognisable date.
+        # OverflowError: a numeric token is too large to build a date from.
+        # Either way the string does not contain a usable date.
+        return False
+
 
 def create_webdriver(web_driver) -> webdriver.Chrome:
     """

diff --git a/uk_bin_collection/uk_bin_collection/councils/SouthOxfordshireCouncil.py b/uk_bin_collection/uk_bin_collection/councils/SouthOxfordshireCouncil.py
@@ -56,16 +56,26 @@ def parse_data(self, page: str, **kwargs) -> dict:
 
         # Page has slider info side by side, which are two instances of this class
         for bin in soup.find_all("div", {"class": "binextra"}):
-            bin_info = bin.text.split("-")
+            bin_info = list(bin.stripped_strings)
             try:
-                # No date validation since year isn't included on webpage
-                bin_date = get_next_occurrence_from_day_month(
-                    datetime.strptime(
-                        bin_info[0].strip() + " " + datetime.today().strftime("%Y"),
-                        "%A %d %B %Y",
-                    )
-                ).strftime(date_format)
-                bin_type = str.capitalize(bin_info[1].strip())
+                # On standard collection schedule, date will be contained in the first stripped string
+                if contains_date(bin_info[0]):
+                    bin_date = get_next_occurrence_from_day_month(
+                        datetime.strptime(
+                            bin_info[0] + " " + datetime.today().strftime("%Y"),
+                            "%A %d %B - %Y",
+                        )
+                    ).strftime(date_format)
+                    bin_type = str.capitalize(bin_info[1])
+                # On exceptional collection schedule (e.g. around English Bank Holidays), date will be contained in the second stripped string
+                else:
+                    bin_date = get_next_occurrence_from_day_month(
+                        datetime.strptime(
+                            bin_info[1] + " " + datetime.today().strftime("%Y"),
+                            "%A %d %B - %Y",
+                        )
+                    ).strftime(date_format)
+                    bin_type = str.capitalize(bin_info[2])
             except Exception as ex:
                 raise ValueError(f"Error parsing bin data: {ex}")
 

diff --git a/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py b/uk_bin_collection/uk_bin_collection/councils/ValeofWhiteHorseCouncil.py
@@ -0,0 +1,91 @@
+import requests
+from bs4 import BeautifulSoup
+from uk_bin_collection.uk_bin_collection.common import *
+from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
+
+
+# import the wonderful Beautiful Soup and the URL grabber
+class CouncilClass(AbstractGetBinDataClass):
+    """
+    Bin collection scraper for Vale of White Horse Council.
+
+    Concrete classes have to implement all abstract operations of the
+    base class. They can also override some operations with a default
+    implementation.
+    """
+
+    def parse_data(self, page: str, **kwargs) -> dict:
+        """
+        Request the council's bin page for a property and extract collections.
+
+        :param page: not used by this implementation; the page is requested
+            here because the UPRN has to be supplied via a cookie
+        :param kwargs: must contain ``uprn``, the property to look up
+        :return: dict of the form
+            ``{"bins": [{"type": ..., "collectionDate": ...}, ...]}``,
+            sorted by collection date
+        :raises ValueError: if a "binextra" block cannot be parsed
+        """
+        user_uprn = kwargs.get("uprn")
+        check_uprn(user_uprn)
+
+        # UPRN is passed in via a cookie. Set cookies/params and GET the page
+        cookies = {
+            "SVBINZONE": f"VALE%3AUPRN%40{user_uprn}",
+        }
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+            "Accept-Language": "en-GB,en;q=0.7",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Referer": "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb?SOVA_TAG=VALE&ebd=0&ebz=1_1704201201813",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "same-origin",
+            "Sec-Fetch-User": "?1",
+            "Sec-GPC": "1",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
+        }
+        params = {
+            "SOVA_TAG": "VALE",
+            "ebd": "0",
+        }
+        requests.packages.urllib3.disable_warnings()
+        response = requests.get(
+            "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb",
+            params=params,
+            headers=headers,
+            cookies=cookies,
+            # Without a timeout an unresponsive server would block the caller
+            # indefinitely.
+            timeout=30,
+        )
+
+        # Parse response text for super speedy finding
+        soup = BeautifulSoup(response.text, features="html.parser")
+
+        data = {"bins": []}
+
+        # Page has slider info side by side, which are two instances of this class
+        for bin_div in soup.find_all("div", {"class": "binextra"}):
+            bin_info = list(bin_div.stripped_strings)
+            try:
+                # On standard collection schedule, the date is the first
+                # stripped string. On exceptional collection schedule (e.g.
+                # around English Bank Holidays), it is the second. In both
+                # cases the bin type is the string right after the date.
+                date_index = 0 if contains_date(bin_info[0]) else 1
+                # The date text carries no year, so the current year is
+                # appended before parsing; the helper then picks the actual
+                # occurrence (presumably rolling over past dates - see
+                # get_next_occurrence_from_day_month in common.py).
+                bin_date = get_next_occurrence_from_day_month(
+                    datetime.strptime(
+                        bin_info[date_index] + " " + datetime.today().strftime("%Y"),
+                        "%A %d %B - %Y",
+                    )
+                ).strftime(date_format)
+                bin_type = str.capitalize(bin_info[date_index + 1])
+            except Exception as ex:
+                # Chain the original exception so the traceback still shows
+                # where the parsing actually failed.
+                raise ValueError(f"Error parsing bin data: {ex}") from ex
+
+            # Build data dict for each entry
+            dict_data = {
+                "type": bin_type,
+                "collectionDate": bin_date,
+            }
+            data["bins"].append(dict_data)
+
+        data["bins"].sort(
+            key=lambda x: datetime.strptime(x.get("collectionDate"), date_format)
+        )
+
+        return data