"""Helpers shared by the UTM, UTSC and UTSG exam scrapers."""

from datetime import datetime

# Calendar months in which exams are held, keyed by month number. The labels
# are the ones used to build period names and to look up course-ID endings
# ('june' is intentionally spelled out, unlike the other three).
_EXAM_MONTHS = {4: 'apr', 6: 'june', 8: 'aug', 12: 'dec'}

_ISO_DATE = '%Y-%m-%d'


def convert_time(t):
    """Convert a clock time string to a number of seconds.

    Args:
        t: A colon-separated time such as ``'9:30'`` or ``'14:00:15'``.
            Components are read as hours, minutes, seconds, in that order;
            missing trailing components count as zero and any component past
            the third is ignored.

    Returns:
        The number of seconds represented by ``t`` as an int.

    Raises:
        ValueError: If any component is not an integer (including ``''``).
    """
    parts = [int(x) for x in t.split(':')]
    # zip() stops at the shorter sequence, which both tolerates 'HH:MM' and
    # ignores anything after the seconds field.
    return sum(value * unit for value, unit in zip(parts, (3600, 60, 1)))


def get_period(d):
    """Return the exam period name for an ISO 8601 date.

    Args:
        d: A date string of the form ``YYYY-MM-DD``.

    Returns:
        The upper-cased month label followed by the two-digit year (for
        example ``'DEC16'`` or ``'JUNE16'``) when the date falls in April,
        June, August or December; ``None`` for any other month.

    Raises:
        ValueError: If ``d`` is not a valid ``YYYY-MM-DD`` date.
    """
    parsed = datetime.strptime(d, _ISO_DATE)

    # Each exam period spans a whole calendar month, so the month number alone
    # decides the period; no locale-dependent month-name parsing is needed.
    month = _EXAM_MONTHS.get(parsed.month)
    if month is None:
        return None

    return '%s%02d' % (month.upper(), parsed.year % 100)


def get_course_id(course_code, date):
    """Build the exam ID and course ID for a course examined on ``date``.

    Args:
        course_code: The course code; its last character is the season
            letter (``'F'``, ``'S'`` or ``'Y'``).
        date: The exam date as a ``YYYY-MM-DD`` string.

    Returns:
        A tuple ``(exam_id, course_id)``. Both are ``None`` when the course
        code is empty, the date is outside an exam month, or the season
        letter has no ending defined for that month.

    Raises:
        ValueError: If ``date`` is not a valid ``YYYY-MM-DD`` date.
    """
    parsed = datetime.strptime(date, _ISO_DATE)
    month, year = _EXAM_MONTHS.get(parsed.month), parsed.year

    if month is None or not course_code:
        return None, None

    # Suffix appended to the course code, per exam month and season letter.
    endings = {
        'dec': {
            'F': '%d9' % year,
            'Y': '%d9' % (year - 1)
        },
        'apr': {
            'S': '%d1' % year,
            'Y': '%d9' % (year - 1)
        },
        'june': {
            'F': '%d5F' % year,
            'Y': '%d5' % year
        },
        'aug': {
            'S': '%d5S' % year,
            'Y': '%d5' % year
        }
    }

    season = course_code[-1]
    if season not in endings[month]:
        return None, None

    course_id = '%s%s' % (course_code, endings[month][season])
    exam_id = '%s%s' % (course_id, get_period(date))

    return exam_id, course_id
scrape(location='.'): @staticmethod def retrieve_exams(courses): - exams = OrderedDict() for course in courses: @@ -56,15 +56,15 @@ def retrieve_exams(courses): date = data[0].split(': ')[1] - id_, course_id = UTMExams.get_course_id(course_code, date) + exam_id, course_id = get_course_id(course_code, date) - period = UTMExams.get_period(date) + period = get_period(date) - if not id_ or not period: + if not exam_id or not period: continue - start, end = UTMExams.parse_time(data[1].split(': ')[1], - data[2].split(': ')[1], date) + start = convert_time(data[1].split(': ')[1]) + end = convert_time(data[2].split(': ')[1]) duration = end - start sections = [UTMExams.parse_sections(room.split(': ')[1]) @@ -75,7 +75,7 @@ def retrieve_exams(courses): sections[i]['lecture'] = lecture_code or '' doc = OrderedDict([ - ('id', id_), + ('id', exam_id), ('course_id', course_id), ('course_code', course_code), ('campus', 'UTM'), @@ -87,11 +87,11 @@ def retrieve_exams(courses): ('sections', []) ]) - if id_ not in exams: - exams[id_] = doc + if exam_id not in exams: + exams[exam_id] = doc for section in sections: - exams[id_]['sections'].append(OrderedDict([ + exams[exam_id]['sections'].append(OrderedDict([ ('lecture_code', section['lecture']), ('exam_section', section['section']), ('location', section['room']) @@ -109,63 +109,6 @@ def get_page_links(endpoint): return [li.find('a')['href'] for li in soup.find('ul', class_='link').find_all('li')] - @staticmethod - def get_period(d): - def get_date(month, date, year): - months = { - 'dec': 12, - 'apr': 4, - 'june': 6, - 'aug': 8 - } - return datetime.strptime('%s-%d-%d' % (year, months[month], date), - '%Y-%m-%d') - - d = datetime.strptime(d, '%Y-%m-%d') - - year = d.year - month = None - - for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): - if get_date(m, 1, year) <= d <= get_date(m, ld, year): - month = m - break - - if month: - return '%s%s' % (month.upper(), str(year)[2:]) - - @staticmethod - def 
get_course_id(course_code, date): - d = datetime.strptime(date, '%Y-%m-%d') - month, year, period = d.strftime('%b').lower(), d.year, UTMExams.get_period(date) - endings = { - 'dec': { - 'F': '%s9' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'apr': { - 'S': '%s1' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'june': { - 'F': '%s5F' % str(year), - 'Y': '%s5' % str(year) - }, - 'aug': { - 'S': '%s5S' % str(year), - 'Y': '%s5' % str(year) - } - } - - season = course_code[-1] - exam_id = course_id = None - - if month in endings and season in endings[month]: - course_id = '%s%s' % (course_code, endings[month][season]) - exam_id = '%s%s' % (course_id, period) - - return exam_id, course_id - @staticmethod def parse_sections(room): section = '' @@ -173,10 +116,3 @@ def parse_sections(room): room, section = [x.strip() for x in re.sub('[()]', ' ', room).split(' ')] return {'section': section, 'room': room} - - @staticmethod - def parse_time(start, end, date): - def convert_time(t): - h, m, s = [int(x) for x in t.split(':')] - return (h * 60 * 60) + (m * 60) + s - return convert_time(start), convert_time(end) diff --git a/uoftscrapers/scrapers/exams/utsc.py b/uoftscrapers/scrapers/exams/utsc.py index 42f35d7..a588e3b 100644 --- a/uoftscrapers/scrapers/exams/utsc.py +++ b/uoftscrapers/scrapers/exams/utsc.py @@ -1,4 +1,5 @@ from ..utils import Scraper +from .exams_helpers import * from bs4 import BeautifulSoup from collections import OrderedDict from datetime import datetime @@ -31,20 +32,22 @@ def scrape(location='.'): course_code, lecture_code = course_code.split(' ') date = data[1] - start, end = UTSCExams.parse_time(data[2], data[3], date) - duration = end - start - - location_ = data[4] - id_, course_id = UTSCExams.get_course_id(course_code, date) + exam_id, course_id = get_course_id(course_code, date) - period = UTSCExams.get_period(date) + period = get_period(date) - if not id_ or not period: + if not exam_id or not period: continue + start = 
convert_time(data[2]) + end = convert_time(data[3]) + duration = end - start + + location_ = data[4] + doc = OrderedDict([ - ('id', id_), + ('id', exam_id), ('course_id', course_id), ('course_code', course_code), ('campus', 'UTSC'), @@ -56,10 +59,10 @@ def scrape(location='.'): ('sections', []) ]) - if id_ not in exams: - exams[id_] = doc + if exam_id not in exams: + exams[exam_id] = doc - exams[id_]['sections'].append(OrderedDict([ + exams[exam_id]['sections'].append(OrderedDict([ ('lecture_code', lecture_code or ''), ('exam_section', ''), ('location', location_) @@ -69,68 +72,3 @@ def scrape(location='.'): Scraper.save_json(doc, location, id_) Scraper.logger.info('UTSCExams completed.') - - @staticmethod - def get_period(d): - def get_date(month, date, year): - months = { - 'dec': 12, - 'apr': 4, - 'june': 6, - 'aug': 8 - } - return datetime.strptime('%s-%d-%d' % (year, months[month], date), - '%Y-%m-%d') - - d = datetime.strptime(d, '%Y-%m-%d') - - year = d.year - month = None - - for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): - if get_date(m, 1, year) <= d <= get_date(m, ld, year): - month = m - break - - if month: - return '%s%s' % (month.upper(), str(year)[2:]) - - @staticmethod - def get_course_id(course_code, date): - d = datetime.strptime(date, '%Y-%m-%d') - month, year, period = d.strftime( - "%b").lower(), d.year, UTSCExams.get_period(date) - endings = { - 'dec': { - 'F': '%s9' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'apr': { - 'S': '%s1' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'june': { - 'F': '%s5F' % str(year), - 'Y': '%s5' % str(year) - }, - 'aug': { - 'S': '%s5S' % str(year), - 'Y': '%s5' % str(year) - } - } - - season = course_code[-1] - exam_id = course_id = None - - if month in endings and season in endings[month]: - course_id = '%s%s' % (course_code, endings[month][season]) - exam_id = '%s%s' % (course_id, period) - - return exam_id, course_id - - @staticmethod - def parse_time(start, end, date): 
- def convert_time(t): - h, m = [int(x) for x in t.split(':')] - return (h * 60 * 60) + (m * 60) - return convert_time(start), convert_time(end) diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index ffa7e1b..1e24b0a 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -1,4 +1,5 @@ from ..utils import Scraper +from .exams_helpers import * from bs4 import BeautifulSoup from collections import OrderedDict from datetime import datetime @@ -6,7 +7,26 @@ class UTSGExams: - """A scraper for UTSG exams. + """A scraper for UTSG exams.""" + + @staticmethod + def scrape(location='.', year=None): + """Update the local JSON files for this scraper.""" + + Scraper.logger.info('UTSGExams initialized.') + + for faculty in ArtSciExams, EngExams: + exams = faculty.scrape(location=location, year=year, save=False) + if exams is None: + continue + for id_, doc in exams.items(): + Scraper.save_json(doc, location, id_) + + Scraper.logger.info('UTSGExams completed.') + + +class ArtSciExams: + """A scraper for Arts & Science exams. Data is scraped from http://www.artsci.utoronto.ca/current/exams/ """ @@ -14,20 +34,20 @@ class UTSGExams: host = 'http://www.artsci.utoronto.ca/current/exams/' @staticmethod - def scrape(location='.', year=None): + def scrape(location='.', year=None, save=True): """Update the local JSON files for this scraper.""" - Scraper.logger.info('UTSGExams initialized.') + Scraper.logger.info('ArtSciExams initialized.') exams = OrderedDict() - for p in UTSGExams.get_exam_periods(year): + for p in ArtSciExams.get_exam_periods(year): Scraper.logger.info('Scraping %s exams.' 
% p.upper()) headers = { - 'Referer': UTSGExams.host + 'Referer': ArtSciExams.host } - html = Scraper.get('%s%s' % (UTSGExams.host, p), + html = Scraper.get('%s%s' % (ArtSciExams.host, p), headers=headers, max_attempts=3) @@ -45,9 +65,10 @@ def scrape(location='.', year=None): for row in rows[1:]: data = [x.text.strip() for x in row.find_all('td')] - id_, course_id, course_code = UTSGExams.parse_course_info(p, data[0]) + exam_id, course_id, course_code = \ + ArtSciExams.parse_course_info(p, data[0]) - if id_ is None: + if exam_id is None: continue section = data[1] @@ -63,12 +84,12 @@ def scrape(location='.', year=None): location_ = data[4] - date = UTSGExams.parse_date(data[2], p[-2:]) or '' - start, end = UTSGExams.parse_time(data[3], date) or (0, 0) + date = ArtSciExams.parse_date(data[2], p[-2:]) or '' + start, end = ArtSciExams.parse_time(data[3], date) or (0, 0) duration = end - start doc = OrderedDict([ - ('id', id_), + ('id', exam_id), ('course_id', course_id), ('course_code', course_code), ('campus', 'UTSG'), @@ -80,19 +101,21 @@ def scrape(location='.', year=None): ('sections', []) ]) - if id_ not in exams: - exams[id_] = doc + if exam_id not in exams: + exams[exam_id] = doc - exams[id_]['sections'].append(OrderedDict([ + exams[exam_id]['sections'].append(OrderedDict([ ('lecture_code', lecture_section or ''), ('exam_section', exam_section or ''), ('location', location_) ])) - for id_, doc in exams.items(): - Scraper.save_json(doc, location, id_) + if save: + for id_, doc in exams.items(): + Scraper.save_json(doc, location, id_) - Scraper.logger.info('UTSGExams completed.') + Scraper.logger.info('ArtSciExams completed.') + return exams @staticmethod def parse_course_info(period, course_code): @@ -128,12 +151,8 @@ def parse_course_info(period, course_code): @staticmethod def parse_date(date, year): """Convert date of form `D DD MMM` to ISO 8601 format.""" - - date = date.split(' ') - if len(date) == 3: - day, date, month = date - - return 
class EngExams:
    """A scraper for Engineering exams.

    Data is scraped from http://www.apsc.utoronto.ca/timetable/fes.aspx
    """

    host = 'http://www.apsc.utoronto.ca/timetable/fes.aspx'

    # No end times are published; 2.5h is used for every exam per
    # http://www.undergrad.engineering.utoronto.ca/Office_of_the_Registrar/Examinations/Schedules_Locations.htm
    DURATION = 2 * 60 * 60 + 30 * 60

    @staticmethod
    def scrape(location='.', year=None, save=True):
        """Update the local JSON files for this scraper.

        Args:
            location: Directory handed to ``Scraper.save_json``.
            year: Accepted so the signature matches the other exam scrapers;
                it is not used here.
            save: When true, write every scraped exam with
                ``Scraper.save_json`` before returning.

        Returns:
            An ``OrderedDict`` mapping exam ID to exam document, or ``None``
            when the page (or its exam table) could not be obtained.
        """

        Scraper.logger.info('EngExams initialized.')

        exams = OrderedDict()

        headers = {
            'Referer': EngExams.host
        }
        html = Scraper.get(EngExams.host, headers=headers, max_attempts=3)

        # NOTE(review): presumably Scraper.get returns None once all attempts
        # fail - confirm. The check has to happen before parsing, because
        # BeautifulSoup(...) itself never evaluates to None.
        if html is None:
            return None

        soup = BeautifulSoup(html, 'html.parser')

        table = soup.find('table', id='DataList1')
        if table is None:
            # Without the listing table there is nothing to scrape.
            return None

        for row in table.find_all('tr'):
            for cell in row.find_all('td'):
                entry = cell.find('div', id='logo')

                if entry is None:
                    continue

                info = entry.find('div')

                # The text following the first two <br> tags holds the
                # labelled date and time, in that order.
                date_text, time_text = [br.next_sibling.strip()
                                        for br in info.find_all('br')[:2]]

                date = datetime.strptime(date_text.split(':')[-1].strip(),
                                         '%b %d, %Y').date().isoformat()

                start = EngExams._parse_start_time(time_text)
                duration = EngExams.DURATION
                end = start + duration

                period = get_period(date)

                exam_id, course_id, course_code = EngExams.get_course_info(
                    info.find('strong').text.strip(), period)

                # Dates outside a known exam period yield no IDs; skip them,
                # as the other exam scrapers do.
                if exam_id is None:
                    continue

                rooms = entry.find('table', class_='xx')

                exam_sections = []
                # The first row of the rooms table is skipped.
                for room_row in rooms.find_all('tr')[1:]:
                    # Distinct local names: rebinding `location` here would
                    # clobber the output directory used by save_json below.
                    room, section_range = [
                        td.text.strip() for td in room_row.find_all('td')[:2]]

                    exam_sections.append(OrderedDict([
                        ('lecture_code', ''),
                        ('exam_section', section_range),
                        ('location', room.replace('-', ' '))
                    ]))

                exams[exam_id] = OrderedDict([
                    ('id', exam_id),
                    ('course_id', course_id),
                    ('course_code', course_code),
                    ('campus', 'UTSG'),
                    ('period', period),
                    ('date', date),
                    ('start_time', start),
                    ('end_time', end),
                    ('duration', duration),
                    ('sections', exam_sections)
                ])

        if save:
            for id_, doc in exams.items():
                Scraper.save_json(doc, location, id_)

        Scraper.logger.info('EngExams completed.')
        return exams

    @staticmethod
    def _parse_start_time(text):
        """Convert a labelled 12-hour time to seconds since midnight.

        ``text`` is split on ':' into a label, the hour, and the minute
        followed by a space and the meridiem, e.g. ``'Time: 9:30 AM'``.
        """
        fields = text.strip().split(':')
        hour = int(fields[1])
        minute, meridiem = fields[2].split(' ')

        # 12 AM is hour 0 and 12 PM is hour 12 on the 24-hour clock.
        hour = hour % 12 + (12 if meridiem == 'PM' else 0)

        return hour * 60 * 60 + int(minute) * 60

    @staticmethod
    def get_course_info(course, period):
        """Derive exam ID, course ID and course code for an exam.

        Args:
            course: The course code without its season letter.
            period: The exam period name (month label followed by a
                two-digit year, e.g. ``'DEC16'``), or ``None``.

        Returns:
            A tuple ``(exam_id, course_id, course_code)``; all three are
            ``None`` when ``period`` is empty or its month is not a known
            exam month.
        """
        endings = {
            'dec': {'season': 'F', 'month': '9'},
            'apr': {'season': 'S', 'month': '1'},
            'june': {'season': 'F', 'month': '5F'},
            'aug': {'season': 'S', 'month': '5S'}
        }

        exam_id = course_id = course_code = None

        # A missing period (date outside every exam month) has no IDs.
        if not period:
            return exam_id, course_id, course_code

        month, year = period[:-2].lower(), period[-2:]
        if month in endings:
            course_code = '%s%s' % (course, endings[month]['season'])
            course_id = '%s20%s%s' % (course_code, year,
                                      endings[month]['month'])
            exam_id = '%s%s' % (course_id, period)
        return exam_id, course_id, course_code