Skip to content

Commit

Permalink
Merge pull request #68 from kshvmdn/merge-dates
Browse files Browse the repository at this point in the history
Merge event data for campuses based on date
  • Loading branch information
qasim committed Apr 28, 2016
2 parents 6246bd4 + 21818f3 commit 336b327
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 69 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -475,15 +475,15 @@ uoftscrapers.Athletics
##### Output format
```js
{
"id": String,
"date": String,
"campus": String,
"events":[{
"title": String,
"campus": String,
"location": String,
"building_id": String,
"start_time": String,
"end_time": String
"start_time": Number,
"end_time": Number,
"duration": Number
}]
}
```
Expand Down
28 changes: 23 additions & 5 deletions uoftscrapers/scrapers/athletics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,31 @@
from .utm import UTMAthletics
from .utsc import UTSCAthletics

from collections import OrderedDict

class Athletics:

class Athletics:
@staticmethod
def scrape(location='.'):
def scrape(location='.', month=None):
Scraper.logger.info('Athletics initialized.')
UTSGAthletics.scrape(location)
UTMAthletics.scrape(location)
UTSCAthletics.scrape(location)

docs = OrderedDict()

for campus in UTSGAthletics, UTMAthletics, UTSCAthletics:
athletics = campus.scrape(location, month=month, save=False)

if athletics is None:
continue

for date, data in athletics.items():
if date not in docs:
docs[date] = OrderedDict([
('date', date),
('events', [])
])
docs[date]['events'].extend(data['events'])

for date, doc in docs.items():
Scraper.save_json(doc, location, date)

Scraper.logger.info('Athletics completed.')
25 changes: 25 additions & 0 deletions uoftscrapers/scrapers/athletics/athletics_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from datetime import datetime


def get_current_month():
    """Return the current local year and month as a ``'YEAR-MONTH'`` string.

    The month number is not zero-padded (April 2016 yields ``'2016-4'``).
    """
    today = datetime.now()
    return '{}-{}'.format(today.year, today.month)


def get_campus_id(d, campus):
    """Build a campus id from a date and a campus specifier.

    Args:
        d: Date string in ``%Y-%m-%d`` form, e.g. ``'2016-04-05'``.
        campus: Campus specifier appended to the day (one of SG, M, SC).

    Returns:
        The two-digit day of the month followed by ``campus``,
        e.g. ``'05M'``.

    Raises:
        ValueError: If ``d`` does not match ``%Y-%m-%d``.
    """
    parsed = datetime.strptime(d, '%Y-%m-%d')
    return '{:02d}{}'.format(parsed.day, campus)


def is_date_in_month(d, m):
    """Determine if the given date is in the given month.

    Args:
        d: Date string in ``%Y-%m-%d`` form, e.g. ``'2016-04-28'``.
        m: Month string in ``%Y-%m`` form, e.g. ``'2016-04'``.

    Returns:
        True when ``d`` falls in the same year and month as ``m``,
        False otherwise.

    Raises:
        ValueError: If either string does not match its expected format.
    """
    date = datetime.strptime(d, '%Y-%m-%d')
    month = datetime.strptime(m, '%Y-%m')
    # Compare the year as well as the month number; otherwise a date from
    # the same month of a different year would be reported as a match.
    return (date.year, date.month) == (month.year, month.month)


def convert_time(dt):
    """Convert datetime from ISO 8601 format to seconds since midnight.

    Args:
        dt: Timestamp string starting with ``YYYY-MM-DDTHH:MM:SS``. Any
            text after the first 19 characters (such as a UTC offset) is
            ignored, so the result reflects the time exactly as written.

    Returns:
        The number of seconds between midnight and the given time of day.

    Raises:
        ValueError: If the first 19 characters do not match
            ``%Y-%m-%dT%H:%M:%S``.
    """
    parsed = datetime.strptime(dt[:19], '%Y-%m-%dT%H:%M:%S')
    # Include the seconds component so the value really is seconds since
    # midnight, not just whole minutes expressed in seconds.
    return parsed.hour * 60 * 60 + parsed.minute * 60 + parsed.second
48 changes: 19 additions & 29 deletions uoftscrapers/scrapers/athletics/utm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ..utils import Scraper
from .athletics_helpers import *
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict
Expand All @@ -15,9 +16,9 @@ class UTMAthletics:
host = 'http://www.utm.utoronto.ca/athletics/schedule/month/'

@staticmethod
def scrape(location='.', month=None):
def scrape(location='.', month=None, save=True):
"""Update the local JSON files for this scraper."""
month = month or UTMAthletics.get_month(month)
month = month or get_current_month()

Scraper.logger.info('UTMAthletics initialized.')
html = Scraper.get('%s%s' % (UTMAthletics.host, month))
Expand All @@ -29,12 +30,13 @@ def scrape(location='.', month=None):
for tr in calendar.find_all('tr', class_='single-day'):
for td in tr.find_all('td'):
date = td.get('data-date')
id_ = UTMAthletics.get_id(date)
id_ = get_campus_id(date, 'M')

if not UTMAthletics.date_in_month(date, month):
if not is_date_in_month(date, month):
continue

events = []

for item in td.find(class_='inner').find_all(class_='item'):

# event cancelled or athletic center closed
Expand All @@ -46,42 +48,30 @@ def scrape(location='.', month=None):

title = item.find(class_='athletics-calendar-title').text
location_ = item.find(class_='athletics-calendar-location').text
start = item.find(class_='date-display-start').get('content')
end = item.find(class_='date-display-end').get('content')

start = convert_time(item.find(class_='date-display-start').get('content'))
end = convert_time(item.find(class_='date-display-end').get('content'))

duration = end - start

events.append(OrderedDict([
('title', title),
('campus', 'UTM'),
('location', location_),
('building_id', '332'),
('start_time', start),
('end_time', end)
('end_time', end),
('duration', duration)
]))

athletics[id_] = OrderedDict([
('id', id_),
athletics[date] = OrderedDict([
('date', date),
('campus', 'UTM'),
('events', events)
])

for id_, doc in athletics.items():
Scraper.save_json(doc, location, id_)
if save:
for id_, doc in athletics.items():
Scraper.save_json(doc, location, id_)

Scraper.logger.info('UTMAthletics completed.')

@staticmethod
def get_month(m):
now = datetime.now()
return '%s-%s' % (now.year, now.month)

@staticmethod
def get_id(d):
day = datetime.strptime(d, '%Y-%m-%d').day
return '%s%s' % (str(day).zfill(2), 'M')

@staticmethod
def date_in_month(d, m):
d = datetime.strptime(d, '%Y-%m-%d')
m = datetime.strptime(m, '%Y-%m')

return d.month == m.month
return athletics
47 changes: 18 additions & 29 deletions uoftscrapers/scrapers/athletics/utsc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ..utils import Scraper
from .athletics_helpers import *
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict
Expand All @@ -15,9 +16,9 @@ class UTSCAthletics:
host = 'http://www.utsc.utoronto.ca/athletics/calendar-node-field-date-time/month/'

@staticmethod
def scrape(location='.', month=None):
def scrape(location='.', month=None, save=True):
"""Update the local JSON files for this scraper."""
month = month or UTSCAthletics.get_month(month)
month = month or get_current_month()

Scraper.logger.info('UTSCAthletics initialized.')
html = Scraper.get('%s%s' % (UTSCAthletics.host, month))
Expand All @@ -29,12 +30,13 @@ def scrape(location='.', month=None):
for tr in calendar.find_all('tr', class_='single-day'):
for td in tr.find_all('td'):
date = td.get('data-date')
id_ = UTSCAthletics.get_id(date)
id_ = get_campus_id(date, 'SC')

if not UTSCAthletics.date_in_month(date, month):
if not is_date_in_month(date, month):
continue

events = []

for item in td.find(class_='inner').find_all(class_='item'):
title = item.find(class_='views-field-title').text.strip()

Expand All @@ -45,42 +47,29 @@ def scrape(location='.', month=None):

location_ = location_.text.strip()

start = item.find(class_='date-display-start').get('content')
end = item.find(class_='date-display-end').get('content')
start = convert_time(item.find(class_='date-display-start').get('content'))
end = convert_time(item.find(class_='date-display-end').get('content'))

duration = end - start

events.append(OrderedDict([
('title', title.replace('/ ', '/')),
('campus', 'UTSC'),
('location', location_),
('building_id', '208'),
('start_time', start),
('end_time', end)
('end_time', end),
('duration', duration)
]))

athletics[id_] = OrderedDict([
('id', id_),
athletics[date] = OrderedDict([
('date', date),
('campus', 'UTSC'),
('events', events)
])

for id_, doc in athletics.items():
Scraper.save_json(doc, location, id_)
if save:
for id_, doc in athletics.items():
Scraper.save_json(doc, location, id_)

Scraper.logger.info('UTSCAthletics completed.')

@staticmethod
def get_month(m):
now = datetime.now()
return '%s-%s' % (now.year, now.month)

@staticmethod
def get_id(d):
day = datetime.strptime(d, '%Y-%m-%d').day
return '%s%s' % (str(day).zfill(2), 'SC')

@staticmethod
def date_in_month(d, m):
d = datetime.strptime(d, '%Y-%m-%d')
m = datetime.strptime(m, '%Y-%m')

return d.month == m.month
return athletics
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/athletics/utsg.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class UTSGAthletics:

@staticmethod
def scrape(location='.'):
def scrape(location='.', month=None, save=True):
Scraper.logger.info('UTSGAthletics initialized.')
Scraper.logger.info('Not implemented.')
Scraper.logger.info('UTSGAthletics completed.')
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/food/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def conv_time(t):
else:
h = int(time)

h += 12 if period == 'p.m.' else 0
h += 12 if period == 'p.m.' and h != 12 else 0
return (60 * 60 * h) + (60 * m)

headers = {
Expand Down

0 comments on commit 336b327

Please sign in to comment.