Skip to content

Commit

Permalink
Merge pull request #524 from sammort/521-vale-of-white-horse
Browse files Browse the repository at this point in the history
Add Vale of White Horse Council
  • Loading branch information
robbrad authored Jan 3, 2024
2 parents 2c08f5b + 433a71b commit 3b33ab7
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 9 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ Useful functions include:
- a function to check [if a date is a holiday](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L117) in a given part of the UK
- a function that returns the [dates of a given weekday](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L136) across a given number (N) of weeks
- a function that returns a [list of dates every N days](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L148) from a given start date
- a function to check [if a string contains a date](./uk_bin_collection/uk_bin_collection/common.py#L249) (leverages [dateutil's parser](https://dateutil.readthedocs.io/en/stable/parser.html))

`common.py` also contains a [standardised date format](https://github.com/robbrad/UKBinCollectionData/blob/e49da2f43143ac7c65fbeaf35b5e86b3ea19e31b/uk_bin_collection/uk_bin_collection/common.py#L11) variable called `date_format`, which is useful to call when formatting datetimes.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ Feature: Test each council output matches expected results
| TorbayCouncil | None | None |
| TorridgeDistrictCouncil | None | None |
| ValeofGlamorganCouncil | None | None |
| ValeofWhiteHorseCouncil | None | None |
| WakefieldCityCouncil | http://selenium:4444 | local |
| WarwickDistrictCouncil | None | None |
| WaverleyBoroughCouncil | None | None |
Expand Down
5 changes: 5 additions & 0 deletions uk_bin_collection/tests/input.json
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,11 @@
"url": "https://www.valeofglamorgan.gov.uk/en/living/Recycling-and-Waste/",
"wiki_name": "Vale of Glamorgan Council"
},
"ValeofWhiteHorseCouncil": {
"uprn": "100121391443",
"url": "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb",
"wiki_name": "Vale of White Horse Council"
},
"WakefieldCityCouncil": {
"custom_component_show_url_field": true,
"skip_get_url": true,
Expand Down
15 changes: 15 additions & 0 deletions uk_bin_collection/uk_bin_collection/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import re
import requests
from datetime import datetime
from dateutil.parser import parse
from enum import Enum
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
Expand Down Expand Up @@ -245,6 +246,20 @@ def validate_dates(bin_dates: dict) -> dict:
raise NotImplementedError()
# If a date is in December and the next is in January, increase the year

def contains_date(string, fuzzy=False) -> bool:
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: True if dateutil's parser accepts the string, False otherwise
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # dateutil reports an unrecognised format with ParserError (a
        # ValueError subclass) and an out-of-range numeric field with
        # OverflowError; neither means the string holds a usable date.
        return False
    return True


def create_webdriver(web_driver) -> webdriver.Chrome:
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,26 @@ def parse_data(self, page: str, **kwargs) -> dict:

# Page has slider info side by side, which are two instances of this class
for bin in soup.find_all("div", {"class": "binextra"}):
bin_info = bin.text.split("-")
bin_info = list(bin.stripped_strings)
try:
# No date validation since year isn't included on webpage
bin_date = get_next_occurrence_from_day_month(
datetime.strptime(
bin_info[0].strip() + " " + datetime.today().strftime("%Y"),
"%A %d %B %Y",
)
).strftime(date_format)
bin_type = str.capitalize(bin_info[1].strip())
# On standard collection schedule, date will be contained in the first stripped string
if contains_date(bin_info[0]):
bin_date = get_next_occurrence_from_day_month(
datetime.strptime(
bin_info[0] + " " + datetime.today().strftime("%Y"),
"%A %d %B - %Y",
)
).strftime(date_format)
bin_type = str.capitalize(bin_info[1])
# On exceptional collection schedule (e.g. around English Bank Holidays), date will be contained in the second stripped string
else:
bin_date = get_next_occurrence_from_day_month(
datetime.strptime(
bin_info[1] + " " + datetime.today().strftime("%Y"),
"%A %d %B - %Y",
)
).strftime(date_format)
bin_type = str.capitalize(bin_info[2])
except Exception as ex:
raise ValueError(f"Error parsing bin data: {ex}")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import requests
from bs4 import BeautifulSoup
from uk_bin_collection.uk_bin_collection.common import *
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass


# import the wonderful Beautiful Soup and the URL grabber
class CouncilClass(AbstractGetBinDataClass):
    """
    Bin collection scraper for Vale of White Horse Council.

    Concrete classes have to implement all abstract operations of the
    base class. They can also override some operations with a default
    implementation.
    """

    def parse_data(self, page: str, **kwargs) -> dict:
        """
        Request the council's bin zone page for a UPRN and extract collections.

        :param page: not used here; the page is requested directly because
            the UPRN has to be supplied as a cookie
        :param kwargs: must contain ``uprn``, the property reference to look up
        :return: dict of the form ``{"bins": [{"type": ..., "collectionDate": ...}]}``
            with entries sorted by collection date
        :raises ValueError: if a bin entry on the page cannot be parsed
        """
        user_uprn = kwargs.get("uprn")
        check_uprn(user_uprn)

        # UPRN is passed in via a cookie. Set cookies/params and GET the page
        cookies = {
            "SVBINZONE": f"VALE%3AUPRN%40{user_uprn}",
        }
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-GB,en;q=0.7",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Referer": "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb?SOVA_TAG=VALE&ebd=0&ebz=1_1704201201813",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-User": "?1",
            "Sec-GPC": "1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        }
        params = {
            "SOVA_TAG": "VALE",
            "ebd": "0",
        }
        requests.packages.urllib3.disable_warnings()
        response = requests.get(
            "https://eform.whitehorsedc.gov.uk/ebase/BINZONE_DESKTOP.eb",
            params=params,
            headers=headers,
            cookies=cookies,
            # Without a timeout an unresponsive server would block forever
            timeout=60,
        )

        # Parse response text for super speedy finding
        soup = BeautifulSoup(response.text, features="html.parser")

        data = {"bins": []}

        # The page shows day and month only, so the current year is appended
        # before parsing; get_next_occurrence_from_day_month then resolves
        # the actual collection date.
        current_year = datetime.today().strftime("%Y")

        # Page has slider info side by side, which are two instances of this class
        for bin_div in soup.find_all("div", {"class": "binextra"}):
            bin_info = list(bin_div.stripped_strings)
            try:
                # On standard collection schedule, date will be contained in
                # the first stripped string. On exceptional collection
                # schedule (e.g. around English Bank Holidays), date will be
                # contained in the second stripped string. The bin type
                # always follows the date.
                date_index = 0 if contains_date(bin_info[0]) else 1
                bin_date = get_next_occurrence_from_day_month(
                    datetime.strptime(
                        bin_info[date_index] + " " + current_year,
                        "%A %d %B - %Y",
                    )
                ).strftime(date_format)
                bin_type = str.capitalize(bin_info[date_index + 1])
            except Exception as ex:
                # Keep the original traceback attached for easier debugging
                raise ValueError(f"Error parsing bin data: {ex}") from ex

            # Build data dict for each entry
            data["bins"].append(
                {
                    "type": bin_type,
                    "collectionDate": bin_date,
                }
            )

        data["bins"].sort(
            key=lambda x: datetime.strptime(x.get("collectionDate"), date_format)
        )

        return data

0 comments on commit 3b33ab7

Please sign in to comment.