From 43292b8cf4c5479bda23ed482b0818dac24cd034 Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Wed, 27 Apr 2016 09:18:59 -0400 Subject: [PATCH] #64: Change time format - Events --- README.md | 4 +- uoftscrapers/scrapers/events/__init__.py | 90 +++++++++++++++++------- 2 files changed, 65 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 875071b..e0cc22a 100644 --- a/README.md +++ b/README.md @@ -611,8 +611,8 @@ https://www.events.utoronto.ca/ title: String, start_date: String end_date: String, - start_time: String, - end_time: String, + start_time: Number, + end_time: Number, url: String, description: String, admission_price: String, diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index 1ef7b2c..af4040a 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -1,5 +1,5 @@ from ..utils import Scraper -from bs4 import BeautifulSoup, NavigableString +from bs4 import BeautifulSoup, NavigableString, Comment from collections import OrderedDict from datetime import datetime, date from urllib.parse import urlencode @@ -11,7 +11,8 @@ class Events: """A scraper for Events at the University of Toronto.""" host = 'https://www.events.utoronto.ca/' - campuses_tags = {'St. George': 'UTSG', 'U of T Mississauga': 'UTM', 'U of T Scarborough': 'UTSC'} + campuses_tags = {'St. George': 'UTSG', 'U of T Mississauga': 'UTM', + 'U of T Scarborough': 'UTSC'} @staticmethod def scrape(location='.'): @@ -46,6 +47,41 @@ def get_events_links(): return events_links + @staticmethod + def convert_time(time_str): + hour_tks = time_str[:-2].split(':') + meridiem = time_str[-2:] + hours = int(hour_tks[0]) + minutes = 0 + if len(hour_tks) > 1: + minutes = int(hour_tks[1]) + if (meridiem == 'pm'): + if (int(hours) != 12): + hours = int(hours) + 12 + posix_from_midnight = hours*60*60 + minutes*60 + return posix_from_midnight + + @staticmethod + def normalize_text_sections(div): + paragraph = '' + for content in div.contents: + text = '' + if type(content) == NavigableString: + text = content + elif type(content) == Comment: + pass + elif content.name == 'li': + text = content.text + else: + text = content.text + text = text.strip() + paragraph += text.strip() + ' ' + paragraph = paragraph.strip() + paragraph = paragraph.replace('\r', '') + paragraph = paragraph.replace('\n', ', ') + paragraph = paragraph.strip() + return paragraph + @staticmethod def get_event_doc(url_tail): event_url = Events.host + url_tail @@ -63,16 +99,22 @@ def get_event_doc(url_tail): # Some of the strings are misformed and gives an extra empty space time_arr = list(filter(None, time_arr)) - event_start_date = datetime.strptime(date_arr[0], - '%b %d').replace(year=date.today().year).date().isoformat() - event_end_date = datetime.strptime(date_arr[-1], - '%b %d').replace(year=date.today().year).date().isoformat() - - # Note: Some events span across several days e.g. 8350, thus specifying - # dates makes no sense - event_meridiem = time_arr[2] - event_start_time = time_arr[0] + ' ' + event_meridiem - event_end_time = time_arr[1] + ' ' + event_meridiem + event_start_date = datetime.strptime(date_arr[0], '%b %d') + event_start_date = event_start_date.replace( + year=date.today().year).date().isoformat() + event_end_date = datetime.strptime(date_arr[-1], '%b %d') + event_end_date = event_end_date.replace( + year=date.today().year).date().isoformat() + + event_start_str = time_arr[0] + event_end_str = time_arr[-2] + time_arr[-1] + if (len(time_arr) == 3): + event_start_str += time_arr[-1] + else: + event_start_str += time_arr[1] + + event_start_time = Events.convert_time(event_start_str) + event_end_time = Events.convert_time(event_end_str) evt_bar = soup.select('#evt_bar')[0] event_url = evt_bar.select('dd')[1].a['href'] @@ -87,27 +129,20 @@ def get_event_doc(url_tail): address_block = evt_bar.select('dd')[0] if address_block.a is not None: address_block = address_block.a - for content in address_block.contents: - text = content if type( - content) == NavigableString else content.text - event_address += text.strip().replace('\r', '') + ' ' - event_address = event_address.strip() + event_address = Events.normalize_text_sections(address_block) - event_audiences = list(map(lambda a: a.text, - evt_bar.select('dl')[1].select('dd')[1].select('a'))) + event_audiences = list( + map(lambda a: a.text, evt_bar.select( + 'dl')[1].select('dd')[1].select('a'))) soup.select('.eventTitle')[0].extract() soup.select('.date')[0].extract() evt_bar.extract() soup.select('#cal_bar')[0].extract() - event_description = '' - for content in soup.select('#content')[0].contents: - text = content if type( - content) == NavigableString else content.text - event_description += text.strip().replace('\r', '') + ' ' - event_description = event_description.strip() - - return OrderedDict([ + event_description = Events.normalize_text_sections( + soup.select('#content')[0]) + + doc = OrderedDict([ ('id', event_id), ('title', event_title), ('start_date', event_start_date), @@ -121,3 +156,4 @@ def get_event_doc(url_tail): ('location', event_address), ('audiences', event_audiences) ]) + return doc