Skip to content

Commit

Permalink
fix: fix date parsing and change BS4 logic
Browse files (browse the repository at this point in the history)
  • Loading branch information
dp247 committed Oct 14, 2023
1 parent 32bebe4 commit 03d6709
Showing 1 changed file with 34 additions and 22 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# This script pulls (in one hit) the data from Bromley Council Bins Data
import dateutil.parser
import datetime
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup
from uk_bin_collection.uk_bin_collection.common import *
from uk_bin_collection.uk_bin_collection.get_bin_data import \
Expand All @@ -20,27 +21,38 @@ def parse_data(self, page: str, **kwargs) -> dict:
soup.prettify()

bin_data_dict = {"bins": []}
collections = []


# Search for the specific bins in the table using BS4
bin_types = soup.find_all("h3", class_="govuk-heading-m waste-service-name")
collection_info = soup.find_all("dl", {"class": "govuk-summary-list"})

# Raise error if data is not loaded at time of scrape (30% chance it is)
if len(bin_types) == 0:
raise ConnectionError("Error fetching council data: data absent when page was scraped.")

# Parse the data
for idx, value in enumerate(collection_info):
bin_type = bin_types[idx].text.strip()
collection_date = value.contents[3].contents[3].text.strip()
next_collection = datetime.strptime(remove_ordinal_indicator_from_date_string(collection_date.replace(',', '')), "%A %d %B")
curr_date = datetime.now().date()
next_collection = next_collection.replace(year=curr_date.year)
if curr_date.month == 12 and next_collection.month == 1:
next_collection = next_collection + relativedelta(years=1)
collections.append((bin_type, next_collection))

# Sort the text and list elements by date
ordered_data = sorted(collections, key=lambda x: x[1])

# Put the elements into the dictionary
for item in ordered_data:
dict_data = {
"type": item[0],
"collectionDate": item[1].strftime(date_format),
}
bin_data_dict["bins"].append(dict_data)

# Search for the specific bin in the table using BS4
rows = soup.find("div", class_=("waste__collections")).find_all(
"h3",
class_=("waste-service-name",),
)

# Loop over the rows
for row in rows:
bin_type = row.get_text().strip()
collectionDate = row.find_all_next(
"dd", {"class": "govuk-summary-list__value"}
)
# Make each Bin element in the JSON, but only if we have a date available
if collectionDate:
date = dateutil.parser.parse(collectionDate[1].text.strip())
dict_data = {
"type": bin_type,
"collectionDate": date.strftime(date_format),
}
# Add data to the main JSON Wrapper
bin_data_dict["bins"].append(dict_data)

return bin_data_dict

0 comments on commit 03d6709

Please sign in to comment.