From b160630a71afdeb7b6061eb6d62f350562b8a602 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 19:15:51 -0400 Subject: [PATCH 1/8] Merge data for each campus --- uoftscrapers/scrapers/athletics/__init__.py | 28 +++++++++++++++++---- uoftscrapers/scrapers/athletics/utm.py | 14 ++++++----- uoftscrapers/scrapers/athletics/utsc.py | 14 ++++++----- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/uoftscrapers/scrapers/athletics/__init__.py b/uoftscrapers/scrapers/athletics/__init__.py index f3790a0..4e168e6 100644 --- a/uoftscrapers/scrapers/athletics/__init__.py +++ b/uoftscrapers/scrapers/athletics/__init__.py @@ -3,13 +3,31 @@ from .utm import UTMAthletics from .utsc import UTSCAthletics +from collections import OrderedDict -class Athletics: +class Athletics: @staticmethod - def scrape(location='.'): + def scrape(location='.', month=None): Scraper.logger.info('Athletics initialized.') - UTSGAthletics.scrape(location) - UTMAthletics.scrape(location) - UTSCAthletics.scrape(location) + + docs = OrderedDict() + + for campus in UTSGAthletics, UTMAthletics, UTSCAthletics: + athletics = campus.scrape(location, month=month, save=False) + + if athletics is None: + continue + + for date, data in athletics.items(): + if date not in docs: + docs[date] = OrderedDict([ + ('date', date), + ('events', []) + ]) + docs[date]['events'].extend(data['events']) + + for date, doc in docs.items(): + Scraper.save_json(doc, location, date) + Scraper.logger.info('Athletics completed.') diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py index 588a467..0535e77 100644 --- a/uoftscrapers/scrapers/athletics/utm.py +++ b/uoftscrapers/scrapers/athletics/utm.py @@ -15,7 +15,7 @@ class UTMAthletics: host = 'http://www.utm.utoronto.ca/athletics/schedule/month/' @staticmethod - def scrape(location='.', month=None): + def scrape(location='.', month=None, save=True): """Update the local JSON files for this scraper.""" month = month or UTMAthletics.get_month(month) @@ -35,6 +35,7 @@ def scrape(location='.', month=None): continue events = [] + for item in td.find(class_='inner').find_all(class_='item'): # event cancelled or athletic center closed @@ -52,20 +53,20 @@ def scrape(location='.', month=None): events.append(OrderedDict([ ('title', title), ('location', location_), + ('campus', 'UTM'), ('building_id', '332'), ('start_time', start), ('end_time', end) ])) - athletics[id_] = OrderedDict([ - ('id', id_), + athletics[date] = OrderedDict([ ('date', date), - ('campus', 'UTM'), ('events', events) ]) - for id_, doc in athletics.items(): - Scraper.save_json(doc, location, id_) + if save: + for id_, doc in athletics.items(): + Scraper.save_json(doc, location, id_) Scraper.logger.info('UTMAthletics completed.') @@ -85,3 +86,4 @@ def date_in_month(d, m): m = datetime.strptime(m, '%Y-%m') return d.month == m.month + return athletics diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py index 44e8a5c..8c7d3f7 100644 --- a/uoftscrapers/scrapers/athletics/utsc.py +++ b/uoftscrapers/scrapers/athletics/utsc.py @@ -15,7 +15,7 @@ class UTSCAthletics: host = 'http://www.utsc.utoronto.ca/athletics/calendar-node-field-date-time/month/' @staticmethod - def scrape(location='.', month=None): + def scrape(location='.', month=None, save=True): """Update the local JSON files for this scraper.""" month = month or UTSCAthletics.get_month(month) @@ -35,6 +35,7 @@ def scrape(location='.', month=None): continue events = [] + for item in td.find(class_='inner').find_all(class_='item'): title = item.find(class_='views-field-title').text.strip() @@ -51,20 +52,20 @@ def scrape(location='.', month=None): events.append(OrderedDict([ ('title', title.replace('/ ', '/')), ('location', location_), + ('campus', 'UTSC'), ('building_id', '208'), ('start_time', start), ('end_time', end) ])) - athletics[id_] = OrderedDict([ - ('id', id_), + athletics[date] = OrderedDict([ ('date', date), - ('campus', 'UTSC'), ('events', events) ]) - for id_, doc in athletics.items(): - Scraper.save_json(doc, location, id_) + if save: + for id_, doc in athletics.items(): + Scraper.save_json(doc, location, id_) Scraper.logger.info('UTSCAthletics completed.') @@ -84,3 +85,4 @@ def date_in_month(d, m): m = datetime.strptime(m, '%Y-%m') return d.month == m.month + return athletics From dcc7d27e9a961f6cfa11c44a43dc9c5e943c48b6 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 19:16:26 -0400 Subject: [PATCH 2/8] Modularize helper functions --- .../scrapers/athletics/athletics_helpers.py | 19 +++++++++++++++ uoftscrapers/scrapers/athletics/utm.py | 24 ++++--------------- uoftscrapers/scrapers/athletics/utsc.py | 24 ++++--------------- 3 files changed, 27 insertions(+), 40 deletions(-) create mode 100644 uoftscrapers/scrapers/athletics/athletics_helpers.py diff --git a/uoftscrapers/scrapers/athletics/athletics_helpers.py b/uoftscrapers/scrapers/athletics/athletics_helpers.py new file mode 100644 index 0000000..3ea76b5 --- /dev/null +++ b/uoftscrapers/scrapers/athletics/athletics_helpers.py @@ -0,0 +1,19 @@ +from datetime import datetime + + +def get_current_month(): + """Return current month.""" + now = datetime.now() + return '%s-%s' % (now.year, now.month) + + +def get_campus_id(d, campus): + """Return campus id, made up of date and specifier (one of SG, M, SC).""" + d = datetime.strptime(d, '%Y-%m-%d') + return '%s%s' % (str(d.day).zfill(2), campus) + + +def is_date_in_month(d, m): + """Determine if the given date is in the given month.""" + d, m = datetime.strptime(d, '%Y-%m-%d'), datetime.strptime(m, '%Y-%m') + return d.month == m.month diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py index 0535e77..e3f164e 100644 --- a/uoftscrapers/scrapers/athletics/utm.py +++ b/uoftscrapers/scrapers/athletics/utm.py @@ -1,4 +1,5 @@ from ..utils import Scraper +from .athletics_helpers import * from bs4 import BeautifulSoup from datetime import datetime from collections import OrderedDict @@ -17,7 +18,7 @@ class UTMAthletics: @staticmethod def scrape(location='.', month=None, save=True): """Update the local JSON files for this scraper.""" - month = month or UTMAthletics.get_month(month) + month = month or get_current_month() Scraper.logger.info('UTMAthletics initialized.') html = Scraper.get('%s%s' % (UTMAthletics.host, month)) @@ -29,9 +30,9 @@ def scrape(location='.', month=None, save=True): for tr in calendar.find_all('tr', class_='single-day'): for td in tr.find_all('td'): date = td.get('data-date') - id_ = UTMAthletics.get_id(date) + id_ = get_campus_id(date, 'M') - if not UTMAthletics.date_in_month(date, month): + if not is_date_in_month(date, month): continue events = [] @@ -69,21 +70,4 @@ def scrape(location='.', month=None, save=True): Scraper.save_json(doc, location, id_) Scraper.logger.info('UTMAthletics completed.') - - @staticmethod - def get_month(m): - now = datetime.now() - return '%s-%s' % (now.year, now.month) - - @staticmethod - def get_id(d): - day = datetime.strptime(d, '%Y-%m-%d').day - return '%s%s' % (str(day).zfill(2), 'M') - - @staticmethod - def date_in_month(d, m): - d = datetime.strptime(d, '%Y-%m-%d') - m = datetime.strptime(m, '%Y-%m') - - return d.month == m.month return athletics diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py index 8c7d3f7..3088c25 100644 --- a/uoftscrapers/scrapers/athletics/utsc.py +++ b/uoftscrapers/scrapers/athletics/utsc.py @@ -1,4 +1,5 @@ from ..utils import Scraper +from .athletics_helpers import * from bs4 import BeautifulSoup from datetime import datetime from collections import OrderedDict @@ -17,7 +18,7 @@ class UTSCAthletics: @staticmethod def scrape(location='.', month=None, save=True): """Update the local JSON files for this scraper.""" - month = month or UTSCAthletics.get_month(month) + month = month or get_current_month() Scraper.logger.info('UTSCAthletics initialized.') html = Scraper.get('%s%s' % (UTSCAthletics.host, month)) @@ -29,9 +30,9 @@ def scrape(location='.', month=None, save=True): for tr in calendar.find_all('tr', class_='single-day'): for td in tr.find_all('td'): date = td.get('data-date') - id_ = UTSCAthletics.get_id(date) + id_ = get_campus_id(date, 'SC') - if not UTSCAthletics.date_in_month(date, month): + if not is_date_in_month(date, month): continue events = [] @@ -68,21 +69,4 @@ def scrape(location='.', month=None, save=True): Scraper.save_json(doc, location, id_) Scraper.logger.info('UTSCAthletics completed.') - - @staticmethod - def get_month(m): - now = datetime.now() - return '%s-%s' % (now.year, now.month) - - @staticmethod - def get_id(d): - day = datetime.strptime(d, '%Y-%m-%d').day - return '%s%s' % (str(day).zfill(2), 'SC') - - @staticmethod - def date_in_month(d, m): - d = datetime.strptime(d, '%Y-%m-%d') - m = datetime.strptime(m, '%Y-%m') - - return d.month == m.month return athletics From c0b66f13b277e76142a97de824385caae8fb8467 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 19:17:57 -0400 Subject: [PATCH 3/8] Update UTSGAthletics --- uoftscrapers/scrapers/athletics/utsg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uoftscrapers/scrapers/athletics/utsg.py b/uoftscrapers/scrapers/athletics/utsg.py index 6073a67..ca0cfd3 100644 --- a/uoftscrapers/scrapers/athletics/utsg.py +++ b/uoftscrapers/scrapers/athletics/utsg.py @@ -8,7 +8,7 @@ class UTSGAthletics: @staticmethod - def scrape(location='.'): + def scrape(location='.', month=None, save=True): Scraper.logger.info('UTSGAthletics initialized.') Scraper.logger.info('Not implemented.') Scraper.logger.info('UTSGAthletics completed.') From 9d8deca8887d3c58de3defb1a0171e2b53230d31 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 21:37:31 -0400 Subject: [PATCH 4/8] Convert start, end time to seconds --- uoftscrapers/scrapers/athletics/athletics_helpers.py | 6 ++++++ uoftscrapers/scrapers/athletics/utm.py | 6 +++--- uoftscrapers/scrapers/athletics/utsc.py | 5 ++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/uoftscrapers/scrapers/athletics/athletics_helpers.py b/uoftscrapers/scrapers/athletics/athletics_helpers.py index 3ea76b5..3a701c6 100644 --- a/uoftscrapers/scrapers/athletics/athletics_helpers.py +++ b/uoftscrapers/scrapers/athletics/athletics_helpers.py @@ -17,3 +17,9 @@ def is_date_in_month(d, m): """Determine if the given date is in the given month.""" d, m = datetime.strptime(d, '%Y-%m-%d'), datetime.strptime(m, '%Y-%m') return d.month == m.month + + +def convert_time(dt): + """Convert datetime from ISO 8601 format to seconds since midnight.""" + dt = datetime.strptime(dt[:19], '%Y-%m-%dT%H:%M:%S') + return dt.hour * 60 * 60 + dt.minute * 60 diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py index e3f164e..6d5cb05 100644 --- a/uoftscrapers/scrapers/athletics/utm.py +++ b/uoftscrapers/scrapers/athletics/utm.py @@ -48,8 +48,9 @@ def scrape(location='.', month=None, save=True): title = item.find(class_='athletics-calendar-title').text location_ = item.find(class_='athletics-calendar-location').text - start = item.find(class_='date-display-start').get('content') - end = item.find(class_='date-display-end').get('content') + + start = convert_time(item.find(class_='date-display-start').get('content')) + end = convert_time(item.find(class_='date-display-end').get('content')) events.append(OrderedDict([ ('title', title), @@ -57,7 +58,6 @@ def scrape(location='.', month=None, save=True): ('campus', 'UTM'), ('building_id', '332'), ('start_time', start), - ('end_time', end) ])) athletics[date] = OrderedDict([ diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py index 3088c25..455630a 100644 --- a/uoftscrapers/scrapers/athletics/utsc.py +++ b/uoftscrapers/scrapers/athletics/utsc.py @@ -47,8 +47,8 @@ def scrape(location='.', month=None, save=True): location_ = location_.text.strip() - start = item.find(class_='date-display-start').get('content') - end = item.find(class_='date-display-end').get('content') + start = convert_time(item.find(class_='date-display-start').get('content')) + end = convert_time(item.find(class_='date-display-end').get('content')) events.append(OrderedDict([ ('title', title.replace('/ ', '/')), @@ -56,7 +56,6 @@ def scrape(location='.', month=None, save=True): ('campus', 'UTSC'), ('building_id', '208'), ('start_time', start), - ('end_time', end) ])) athletics[date] = OrderedDict([ From 74e5d12ffb0dbf8318e3e5915cbfc74a63839055 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 21:38:03 -0400 Subject: [PATCH 5/8] Add duration key --- uoftscrapers/scrapers/athletics/utm.py | 4 ++++ uoftscrapers/scrapers/athletics/utsc.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py index 6d5cb05..33127b1 100644 --- a/uoftscrapers/scrapers/athletics/utm.py +++ b/uoftscrapers/scrapers/athletics/utm.py @@ -52,12 +52,16 @@ def scrape(location='.', month=None, save=True): start = convert_time(item.find(class_='date-display-start').get('content')) end = convert_time(item.find(class_='date-display-end').get('content')) + duration = end - start + events.append(OrderedDict([ ('title', title), ('location', location_), ('campus', 'UTM'), ('building_id', '332'), ('start_time', start), + ('end_time', end), + ('duration', duration) ])) athletics[date] = OrderedDict([ diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py index 455630a..e1e806c 100644 --- a/uoftscrapers/scrapers/athletics/utsc.py +++ b/uoftscrapers/scrapers/athletics/utsc.py @@ -50,12 +50,16 @@ def scrape(location='.', month=None, save=True): start = convert_time(item.find(class_='date-display-start').get('content')) end = convert_time(item.find(class_='date-display-end').get('content')) + duration = end - start + events.append(OrderedDict([ ('title', title.replace('/ ', '/')), ('location', location_), ('campus', 'UTSC'), ('building_id', '208'), ('start_time', start), + ('end_time', end), + ('duration', duration) ])) athletics[date] = OrderedDict([ From 845be1b86554eaa60a0eed84b0b39ee958a3740d Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 21:52:22 -0400 Subject: [PATCH 6/8] Patch 12 pm issue 12 pm -> 24:00 -> 86400 seconds --- uoftscrapers/scrapers/food/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uoftscrapers/scrapers/food/__init__.py b/uoftscrapers/scrapers/food/__init__.py index cee3e44..cc09887 100644 --- a/uoftscrapers/scrapers/food/__init__.py +++ b/uoftscrapers/scrapers/food/__init__.py @@ -93,7 +93,7 @@ def conv_time(t): else: h = int(time) - h += 12 if period == 'p.m.' else 0 + h += 12 if period == 'p.m.' and h != 12 else 0 return (60 * 60 * h) + (60 * m) headers = { From c19aff6a71a33396486edf691d23b9a46fb65938 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 22:40:40 -0400 Subject: [PATCH 7/8] Reorder keys --- uoftscrapers/scrapers/athletics/utm.py | 2 +- uoftscrapers/scrapers/athletics/utsc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py index 33127b1..95663c2 100644 --- a/uoftscrapers/scrapers/athletics/utm.py +++ b/uoftscrapers/scrapers/athletics/utm.py @@ -56,8 +56,8 @@ def scrape(location='.', month=None, save=True): events.append(OrderedDict([ ('title', title), - ('location', location_), ('campus', 'UTM'), + ('location', location_), ('building_id', '332'), ('start_time', start), ('end_time', end), diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py index e1e806c..5d6a34b 100644 --- a/uoftscrapers/scrapers/athletics/utsc.py +++ b/uoftscrapers/scrapers/athletics/utsc.py @@ -54,8 +54,8 @@ def scrape(location='.', month=None, save=True): events.append(OrderedDict([ ('title', title.replace('/ ', '/')), - ('location', location_), ('campus', 'UTSC'), + ('location', location_), ('building_id', '208'), ('start_time', start), ('end_time', end), From 21818f384557c498c06125576d28ab044f949d04 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 27 Apr 2016 22:41:05 -0400 Subject: [PATCH 8/8] Update Athletics schema --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e0cc22a..66ffafc 100644 --- a/README.md +++ b/README.md @@ -475,15 +475,15 @@ uoftscrapers.Athletics ##### Output format ```js { - "id": String, "date": String, - "campus": String, "events":[{ "title": String, + "campus": String, "location": String, "building_id": String, - "start_time": String, - "end_time": String + "start_time": Number, + "end_time": Number, + "duration": Number }] } ```