From 31607f25a599266580aa448df37354f6f2151fe6 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 1 May 2016 04:29:07 -0400 Subject: [PATCH 1/6] Start engineering exams scraper --- uoftscrapers/scrapers/exams/utsg.py | 138 +++++++++++++++++++++++++--- 1 file changed, 126 insertions(+), 12 deletions(-) diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index ffa7e1b..54ff3cc 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -4,9 +4,121 @@ from datetime import datetime from pytz import timezone +from pprint import pprint + class UTSGExams: - """A scraper for UTSG exams. + """A scraper for UTSG exams.""" + + @staticmethod + def scrape(location='.', year=None): + """Update the local JSON files for this scraper.""" + + Scraper.logger.info('UTSGExams initialized.') + + artsci_exams = ArtSciExams.scrape(location, year, False) + eng_exams = None # EngExams.scrape(location, False) + + for exams in artsci_exams, eng_exams: + if exams is None: + continue + for id_, doc in exams.items(): + Scraper.save_json(doc, location, id_) + + Scraper.logger.info('UTSGExams completed.') + + +class EngExams: + """A scraper for Engineering exams. + + Data is scraped from http://www.artsci.utoronto.ca/current/exams/ + """ + + host = 'http://www.apsc.utoronto.ca/timetable/fes.aspx' + + @staticmethod + def scrape(location='.', save=True): + """Update the local JSON files for this scraper.""" + + Scraper.logger.info('EngExams initialized.') + + exams = OrderedDict() + + headers = { + 'Referer': EngExams.host + } + html = Scraper.get(EngExams.host, headers=headers, max_attempts=3) + soup = BeautifulSoup(html, 'html.parser') + + for tr in soup.find('table', id='DataList1').find_all('tr'): + for td in tr.find_all('td'): + exam = OrderedDict() + + entry = td.find('div', id='logo') + + if entry is None: + continue + + info = entry.find('div') + locations = entry.find('table', class_='xx') + + course_code = info.find('strong').text.strip() + + id_, course_id = course_code, '' + + date, time = [br.next_sibling.strip() + for br in info.find_all('br')[:2]] + + date = datetime.strptime(date.split(':')[-1].strip(), + '%b %d, %Y').date().isoformat() + + time = time.strip().split(':') + hour = int(time[1]) + minute, period = time[2].split(' ') + + hour += 12 if period == 'PM' and hour != 12 else 0 + + start = hour * 60 * 60 + int(minute) * 60 + # No end times, using 2.5h per http://www.undergrad.engineering.utoronto.ca/Office_of_the_Registrar/Examinations/Schedules_Locations.htm + duration = 2 * 60 * 60 + 30 * 60 + end = start + duration + + exam_sections = [] + for tr in locations.find_all('tr')[1:]: + location, range, _ = [td.text.strip() for td in tr.find_all('td')] + + location = location.replace('-', ' ') + range = '' if range == 'A - Z' else range + + exam_sections.append(OrderedDict([ + ('lecture_code', ''), + ('exam_section', range), + ('location', location) + ])) + + exams[id_] = OrderedDict([ + ('id', id_), + ('course_id', course_id), + ('course_code', course_code), + ('campus', 'UTSG'), + ('period', ''), + ('date', date), + ('start_time', start), + ('end_time', end), + ('duration', duration), + ('sections', exam_sections) + ]) + + if save: + for id_, doc in exams.items(): + Scraper.save_json(doc, location, id_) + + Scraper.logger.info('EngExams completed.') + return exams + + +class ArtSciExams: + """A scraper for Art & Science exams. Data is scraped from http://www.artsci.utoronto.ca/current/exams/ """ @@ -14,20 +126,20 @@ class UTSGExams: host = 'http://www.artsci.utoronto.ca/current/exams/' @staticmethod - def scrape(location='.', year=None): + def scrape(location='.', year=None, save=True): """Update the local JSON files for this scraper.""" - Scraper.logger.info('UTSGExams initialized.') + Scraper.logger.info('ArtSciExams initialized.') exams = OrderedDict() - for p in UTSGExams.get_exam_periods(year): + for p in ArtSciExams.get_exam_periods(year): Scraper.logger.info('Scraping %s exams.' % p.upper()) headers = { - 'Referer': UTSGExams.host + 'Referer': ArtSciExams.host } - html = Scraper.get('%s%s' % (UTSGExams.host, p), + html = Scraper.get('%s%s' % (ArtSciExams.host, p), headers=headers, max_attempts=3) @@ -45,7 +157,7 @@ def scrape(location='.', year=None): for row in rows[1:]: data = [x.text.strip() for x in row.find_all('td')] - id_, course_id, course_code = UTSGExams.parse_course_info(p, data[0]) + id_, course_id, course_code = ArtSciExams.parse_course_info(p, data[0]) if id_ is None: continue @@ -63,8 +175,8 @@ def scrape(location='.', year=None): location_ = data[4] - date = UTSGExams.parse_date(data[2], p[-2:]) or '' - start, end = UTSGExams.parse_time(data[3], date) or (0, 0) + date = ArtSciExams.parse_date(data[2], p[-2:]) or '' + start, end = ArtSciExams.parse_time(data[3], date) or (0, 0) duration = end - start doc = OrderedDict([ @@ -89,10 +201,12 @@ def scrape(location='.', year=None): ('location', location_) ])) - for id_, doc in exams.items(): - Scraper.save_json(doc, location, id_) + if save: + for id_, doc in exams.items(): + Scraper.save_json(doc, location, id_) - Scraper.logger.info('UTSGExams completed.') + Scraper.logger.info('ArtSciExams completed.') + return exams @staticmethod def parse_course_info(period, course_code): From 04c6004ba7eb2f1280b771cdc2d8819e2322fce0 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 1 May 2016 13:29:33 -0400 Subject: [PATCH 2/6] Clean up date parser --- uoftscrapers/scrapers/exams/utm.py | 11 +++-------- uoftscrapers/scrapers/exams/utsc.py | 10 ++-------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/uoftscrapers/scrapers/exams/utm.py b/uoftscrapers/scrapers/exams/utm.py index 3d1684f..c07880b 100644 --- a/uoftscrapers/scrapers/exams/utm.py +++ b/uoftscrapers/scrapers/exams/utm.py @@ -112,14 +112,9 @@ def get_page_links(endpoint): @staticmethod def get_period(d): def get_date(month, date, year): - months = { - 'dec': 12, - 'apr': 4, - 'june': 6, - 'aug': 8 - } - return datetime.strptime('%s-%d-%d' % (year, months[month], date), - '%Y-%m-%d') + month = 'jun' if month == 'june' else month + return datetime.strptime('%s %s %d' % (year, month, date), + '%Y %b %d') d = datetime.strptime(d, '%Y-%m-%d') diff --git a/uoftscrapers/scrapers/exams/utsc.py b/uoftscrapers/scrapers/exams/utsc.py index 42f35d7..49cb680 100644 --- a/uoftscrapers/scrapers/exams/utsc.py +++ b/uoftscrapers/scrapers/exams/utsc.py @@ -73,14 +73,8 @@ def scrape(location='.'): @staticmethod def get_period(d): def get_date(month, date, year): - months = { - 'dec': 12, - 'apr': 4, - 'june': 6, - 'aug': 8 - } - return datetime.strptime('%s-%d-%d' % (year, months[month], date), - '%Y-%m-%d') + month = 'jun' if month == 'june' else month + return datetime.strptime('%s%s%d' % (year, month, date), '%Y%b%d') d = datetime.strptime(d, '%Y-%m-%d') From 1c6a28f171e0501fb99177ddb6a706258d4b60f9 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 1 May 2016 13:30:11 -0400 Subject: [PATCH 3/6] Add course/id parser --- uoftscrapers/scrapers/exams/utsg.py | 80 ++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index 54ff3cc..6b421ca 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -16,10 +16,8 @@ def scrape(location='.', year=None): Scraper.logger.info('UTSGExams initialized.') - artsci_exams = ArtSciExams.scrape(location, year, False) - eng_exams = None # EngExams.scrape(location, False) - - for exams in artsci_exams, eng_exams: + for faculty in ArtSciExams, EngExams: + exams = faculty.scrape(location=location, year=year, save=False) if exams is None: continue for id_, doc in exams.items(): @@ -31,13 +29,13 @@ def scrape(location='.', year=None): class EngExams: """A scraper for Engineering exams. - Data is scraped from http://www.artsci.utoronto.ca/current/exams/ + Data is scraped from http://www.apsc.utoronto.ca/timetable/fes.aspx """ host = 'http://www.apsc.utoronto.ca/timetable/fes.aspx' @staticmethod - def scrape(location='.', save=True): + def scrape(location='.', year=None, save=True): """Update the local JSON files for this scraper.""" Scraper.logger.info('EngExams initialized.') @@ -50,10 +48,11 @@ def scrape(location='.', save=True): html = Scraper.get(EngExams.host, headers=headers, max_attempts=3) soup = BeautifulSoup(html, 'html.parser') + if soup is None: + return + for tr in soup.find('table', id='DataList1').find_all('tr'): for td in tr.find_all('td'): - exam = OrderedDict() - entry = td.find('div', id='logo') if entry is None: @@ -62,10 +61,6 @@ def scrape(location='.', save=True): info = entry.find('div') locations = entry.find('table', class_='xx') - course_code = info.find('strong').text.strip() - - id_, course_id = course_code, '' - date, time = [br.next_sibling.strip() for br in info.find_all('br')[:2]] @@ -74,34 +69,37 @@ def scrape(location='.', save=True): time = time.strip().split(':') hour = int(time[1]) - minute, period = time[2].split(' ') + minute, meridiem = time[2].split(' ') + + period = EngExams.get_period(date) + + exam_id, course_id, course_code = \ + EngExams.get_course_info(info.find('strong').text.strip(), period) - hour += 12 if period == 'PM' and hour != 12 else 0 + hour += 12 if meridiem == 'PM' and hour != 12 else 0 + # No end times, using 2.5h for duration per + # http://www.undergrad.engineering.utoronto.ca/Office_of_the_Registrar/Examinations/Schedules_Locations.htm start = hour * 60 * 60 + int(minute) * 60 - # No end times, using 2.5h per http://www.undergrad.engineering.utoronto.ca/Office_of_the_Registrar/Examinations/Schedules_Locations.htm duration = 2 * 60 * 60 + 30 * 60 end = start + duration exam_sections = [] for tr in locations.find_all('tr')[1:]: - location, range, _ = [td.text.strip() for td in tr.find_all('td')] - - location = location.replace('-', ' ') - range = '' if range == 'A - Z' else range + location, range = [td.text.strip() for td in tr.find_all('td')[:2]] exam_sections.append(OrderedDict([ ('lecture_code', ''), ('exam_section', range), - ('location', location) + ('location', location.replace('-', ' ')) ])) - exams[id_] = OrderedDict([ - ('id', id_), + exams[exam_id] = OrderedDict([ + ('id', exam_id), ('course_id', course_id), ('course_code', course_code), ('campus', 'UTSG'), - ('period', ''), + ('period', period), ('date', date), ('start_time', start), ('end_time', end), @@ -116,6 +114,42 @@ def scrape(location='.', save=True): Scraper.logger.info('EngExams completed.') return exams + @staticmethod + def get_course_info(course, period): + endings = { + 'dec': {'season': 'F', 'month': '1'}, + 'apr': {'season': 'S', 'month': '1'}, + 'june': {'season': 'F', 'month': '5F'}, + 'aug': {'season': 'S', 'month': '5S'} + } + + month, year = period[:-2].lower(), period[-2:] + exam_id = course_id = course_code = None + if month in endings: + course_code = '%s%s' % (course, endings[month]['season']) + course_id = '%s20%s%s' % (course_code, year, endings[month]['month']) + exam_id = '%s%s' % (course_id, period) + return exam_id, course_id, course_code + + @staticmethod + def get_period(d): + def get_date(month, date, year): + month = 'jun' if month == 'june' else month + return datetime.strptime('%s%s%d' % (year, month, date), '%Y%b%d') + + d = datetime.strptime(d, '%Y-%m-%d') + + year = d.year + month = None + + for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): + if get_date(m, 1, year) <= d <= get_date(m, ld, year): + month = m + break + + if month: + return '%s%s' % (month.upper(), str(year)[2:]) + class ArtSciExams: """A scraper for Art & Science exams. From 20372f9418fb9fb8fc26e31fd56ebcb182dc5df9 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 1 May 2016 13:38:27 -0400 Subject: [PATCH 4/6] Clean up --- uoftscrapers/scrapers/exams/utsg.py | 255 ++++++++++++++-------------- 1 file changed, 127 insertions(+), 128 deletions(-) diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index 6b421ca..911c759 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -4,8 +4,6 @@ from datetime import datetime from pytz import timezone -from pprint import pprint - class UTSGExams: """A scraper for UTSG exams.""" @@ -26,133 +24,8 @@ def scrape(location='.', year=None): Scraper.logger.info('UTSGExams completed.') -class EngExams: - """A scraper for Engineering exams. - - Data is scraped from http://www.apsc.utoronto.ca/timetable/fes.aspx - """ - - host = 'http://www.apsc.utoronto.ca/timetable/fes.aspx' - - @staticmethod - def scrape(location='.', year=None, save=True): - """Update the local JSON files for this scraper.""" - - Scraper.logger.info('EngExams initialized.') - - exams = OrderedDict() - - headers = { - 'Referer': EngExams.host - } - html = Scraper.get(EngExams.host, headers=headers, max_attempts=3) - soup = BeautifulSoup(html, 'html.parser') - - if soup is None: - return - - for tr in soup.find('table', id='DataList1').find_all('tr'): - for td in tr.find_all('td'): - entry = td.find('div', id='logo') - - if entry is None: - continue - - info = entry.find('div') - locations = entry.find('table', class_='xx') - - date, time = [br.next_sibling.strip() - for br in info.find_all('br')[:2]] - - date = datetime.strptime(date.split(':')[-1].strip(), - '%b %d, %Y').date().isoformat() - - time = time.strip().split(':') - hour = int(time[1]) - minute, meridiem = time[2].split(' ') - - period = EngExams.get_period(date) - - exam_id, course_id, course_code = \ - EngExams.get_course_info(info.find('strong').text.strip(), period) - - hour += 12 if meridiem == 'PM' and hour != 12 else 0 - - # No end times, using 2.5h for duration per - # http://www.undergrad.engineering.utoronto.ca/Office_of_the_Registrar/Examinations/Schedules_Locations.htm - start = hour * 60 * 60 + int(minute) * 60 - duration = 2 * 60 * 60 + 30 * 60 - end = start + duration - - exam_sections = [] - for tr in locations.find_all('tr')[1:]: - location, range = [td.text.strip() for td in tr.find_all('td')[:2]] - - exam_sections.append(OrderedDict([ - ('lecture_code', ''), - ('exam_section', range), - ('location', location.replace('-', ' ')) - ])) - - exams[exam_id] = OrderedDict([ - ('id', exam_id), - ('course_id', course_id), - ('course_code', course_code), - ('campus', 'UTSG'), - ('period', period), - ('date', date), - ('start_time', start), - ('end_time', end), - ('duration', duration), - ('sections', exam_sections) - ]) - - if save: - for id_, doc in exams.items(): - Scraper.save_json(doc, location, id_) - - Scraper.logger.info('EngExams completed.') - return exams - - @staticmethod - def get_course_info(course, period): - endings = { - 'dec': {'season': 'F', 'month': '1'}, - 'apr': {'season': 'S', 'month': '1'}, - 'june': {'season': 'F', 'month': '5F'}, - 'aug': {'season': 'S', 'month': '5S'} - } - - month, year = period[:-2].lower(), period[-2:] - exam_id = course_id = course_code = None - if month in endings: - course_code = '%s%s' % (course, endings[month]['season']) - course_id = '%s20%s%s' % (course_code, year, endings[month]['month']) - exam_id = '%s%s' % (course_id, period) - return exam_id, course_id, course_code - - @staticmethod - def get_period(d): - def get_date(month, date, year): - month = 'jun' if month == 'june' else month - return datetime.strptime('%s%s%d' % (year, month, date), '%Y%b%d') - - d = datetime.strptime(d, '%Y-%m-%d') - - year = d.year - month = None - - for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): - if get_date(m, 1, year) <= d <= get_date(m, ld, year): - month = m - break - - if month: - return '%s%s' % (month.upper(), str(year)[2:]) - - class ArtSciExams: - """A scraper for Art & Science exams. + """A scraper for Arts & Science exams. Data is scraped from http://www.artsci.utoronto.ca/current/exams/ """ @@ -312,3 +185,129 @@ def get_exam_periods(year): periods.append('%s%s' % (m, str(y)[2:])) return periods + + +class EngExams: + """A scraper for Engineering exams. + + Data is scraped from http://www.apsc.utoronto.ca/timetable/fes.aspx + """ + + host = 'http://www.apsc.utoronto.ca/timetable/fes.aspx' + + @staticmethod + def scrape(location='.', year=None, save=True): + """Update the local JSON files for this scraper.""" + + Scraper.logger.info('EngExams initialized.') + + exams = OrderedDict() + + headers = { + 'Referer': EngExams.host + } + html = Scraper.get(EngExams.host, headers=headers, max_attempts=3) + soup = BeautifulSoup(html, 'html.parser') + + if soup is None: + return + + for tr in soup.find('table', id='DataList1').find_all('tr'): + for td in tr.find_all('td'): + entry = td.find('div', id='logo') + + if entry is None: + continue + + info = entry.find('div') + + date, time = [br.next_sibling.strip() + for br in info.find_all('br')[:2]] + + date = datetime.strptime(date.split(':')[-1].strip(), + '%b %d, %Y').date().isoformat() + + time = time.strip().split(':') + hour = int(time[1]) + minute, meridiem = time[2].split(' ') + + hour += 12 if meridiem == 'PM' and hour != 12 else 0 + + # No end times, using 2.5h for duration per + # http://www.undergrad.engineering.utoronto.ca/Office_of_the_Registrar/Examinations/Schedules_Locations.htm + start = hour * 60 * 60 + int(minute) * 60 + duration = 2 * 60 * 60 + 30 * 60 + end = start + duration + + period = EngExams.get_period(date) + + exam_id, course_id, course_code = \ + EngExams.get_course_info(info.find('strong').text.strip(), period) + + locations = entry.find('table', class_='xx') + + exam_sections = [] + for tr in locations.find_all('tr')[1:]: + location, range = [td.text.strip() for td in tr.find_all('td')[:2]] + + exam_sections.append(OrderedDict([ + ('lecture_code', ''), + ('exam_section', range), + ('location', location.replace('-', ' ')) + ])) + + exams[exam_id] = OrderedDict([ + ('id', exam_id), + ('course_id', course_id), + ('course_code', course_code), + ('campus', 'UTSG'), + ('period', period), + ('date', date), + ('start_time', start), + ('end_time', end), + ('duration', duration), + ('sections', exam_sections) + ]) + + if save: + for id_, doc in exams.items(): + Scraper.save_json(doc, location, id_) + + Scraper.logger.info('EngExams completed.') + return exams + + @staticmethod + def get_course_info(course, period): + endings = { + 'dec': {'season': 'F', 'month': '1'}, + 'apr': {'season': 'S', 'month': '1'}, + 'june': {'season': 'F', 'month': '5F'}, + 'aug': {'season': 'S', 'month': '5S'} + } + + month, year = period[:-2].lower(), period[-2:] + exam_id = course_id = course_code = None + if month in endings: + course_code = '%s%s' % (course, endings[month]['season']) + course_id = '%s20%s%s' % (course_code, year, endings[month]['month']) + exam_id = '%s%s' % (course_id, period) + return exam_id, course_id, course_code + + @staticmethod + def get_period(d): + def get_date(month, date, year): + month = 'jun' if month == 'june' else month + return datetime.strptime('%s%s%d' % (year, month, date), '%Y%b%d') + + d = datetime.strptime(d, '%Y-%m-%d') + + year = d.year + month = None + + for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): + if get_date(m, 1, year) <= d <= get_date(m, ld, year): + month = m + break + + if month: + return '%s%s' % (month.upper(), str(year)[2:]) From 4913694daa0615e1a2811e9c79941f0236cbfb1b Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 1 May 2016 13:43:51 -0400 Subject: [PATCH 5/6] Oops --- uoftscrapers/scrapers/exams/utsg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index 911c759..1b0f108 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -279,7 +279,7 @@ def scrape(location='.', year=None, save=True): @staticmethod def get_course_info(course, period): endings = { - 'dec': {'season': 'F', 'month': '1'}, + 'dec': {'season': 'F', 'month': '9'}, 'apr': {'season': 'S', 'month': '1'}, 'june': {'season': 'F', 'month': '5F'}, 'aug': {'season': 'S', 'month': '5S'} From 4176d900b8c71ffd9cf69adecb3c45c2f33cd62c Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 1 May 2016 17:26:01 -0400 Subject: [PATCH 6/6] Modularize common helpers --- uoftscrapers/scrapers/exams/exams_helpers.py | 64 +++++++++++++++ uoftscrapers/scrapers/exams/utm.py | 79 +++--------------- uoftscrapers/scrapers/exams/utsc.py | 84 ++++---------------- uoftscrapers/scrapers/exams/utsg.py | 43 +++------- 4 files changed, 99 insertions(+), 171 deletions(-) create mode 100644 uoftscrapers/scrapers/exams/exams_helpers.py diff --git a/uoftscrapers/scrapers/exams/exams_helpers.py b/uoftscrapers/scrapers/exams/exams_helpers.py new file mode 100644 index 0000000..90b6f93 --- /dev/null +++ b/uoftscrapers/scrapers/exams/exams_helpers.py @@ -0,0 +1,64 @@ +from datetime import datetime + + +def convert_time(t): + t = [int(x) for x in t.split(':')] + + converted = 0 + for i in range(min(len(t), 3)): + converted += t[i] * (60 ** (2-i)) + return converted + + +def get_period(d): + def get_date(month, date, year): + month = 'jun' if month == 'june' else month + return datetime.strptime('%s%s%d' % (year, month, date), '%Y%b%d') + + d = datetime.strptime(d, '%Y-%m-%d') + year, month = d.year, None + + for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): + if get_date(m, 1, year) <= d <= get_date(m, ld, year): + month = m + break + + if month: + return '%s%s' % (month.upper(), str(year)[2:]) + + +def get_course_id(course_code, date): + d = datetime.strptime(date, '%Y-%m-%d') + + month, year = d.strftime('%b').lower(), d.year + month = 'june' if month == 'jun' else month + + endings = { + 'dec': { + 'F': '%d9' % year, + 'Y': '%d9' % (year - 1) + }, + 'apr': { + 'S': '%d1' % year, + 'Y': '%d9' % (year - 1) + }, + 'june': { + 'F': '%d5F' % year, + 'Y': '%d5' % year + }, + 'aug': { + 'S': '%d5S' % year, + 'Y': '%d5' % year + } + } + + season = course_code[-1] + period = get_period(date) + + exam_id = course_id = None + + if month in endings and season in endings[month]: + course_id = '%s%s' % (course_code, endings[month][season]) + exam_id = '%s%s' % (course_id, period) + + return exam_id, course_id diff --git a/uoftscrapers/scrapers/exams/utm.py b/uoftscrapers/scrapers/exams/utm.py index c07880b..e42e545 100644 --- a/uoftscrapers/scrapers/exams/utm.py +++ b/uoftscrapers/scrapers/exams/utm.py @@ -1,4 +1,5 @@ from ..utils import Scraper +from .exams_helpers import * from bs4 import BeautifulSoup from collections import OrderedDict from datetime import datetime @@ -33,7 +34,6 @@ def scrape(location='.'): @staticmethod def retrieve_exams(courses): - exams = OrderedDict() for course in courses: @@ -56,15 +56,15 @@ def retrieve_exams(courses): date = data[0].split(': ')[1] - id_, course_id = UTMExams.get_course_id(course_code, date) + exam_id, course_id = get_course_id(course_code, date) - period = UTMExams.get_period(date) + period = get_period(date) - if not id_ or not period: + if not exam_id or not period: continue - start, end = UTMExams.parse_time(data[1].split(': ')[1], - data[2].split(': ')[1], date) + start = convert_time(data[1].split(': ')[1]) + end = convert_time(data[2].split(': ')[1]) duration = end - start sections = [UTMExams.parse_sections(room.split(': ')[1]) @@ -75,7 +75,7 @@ def retrieve_exams(courses): sections[i]['lecture'] = lecture_code or '' doc = OrderedDict([ - ('id', id_), + ('id', exam_id), ('course_id', course_id), ('course_code', course_code), ('campus', 'UTM'), @@ -87,11 +87,11 @@ def retrieve_exams(courses): ('sections', []) ]) - if id_ not in exams: - exams[id_] = doc + if exam_id not in exams: + exams[exam_id] = doc for section in sections: - exams[id_]['sections'].append(OrderedDict([ + exams[exam_id]['sections'].append(OrderedDict([ ('lecture_code', section['lecture']), ('exam_section', section['section']), ('location', section['room']) @@ -109,58 +109,6 @@ def get_page_links(endpoint): return [li.find('a')['href'] for li in soup.find('ul', class_='link').find_all('li')] - @staticmethod - def get_period(d): - def get_date(month, date, year): - month = 'jun' if month == 'june' else month - return datetime.strptime('%s %s %d' % (year, month, date), - '%Y %b %d') - - d = datetime.strptime(d, '%Y-%m-%d') - - year = d.year - month = None - - for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): - if get_date(m, 1, year) <= d <= get_date(m, ld, year): - month = m - break - - if month: - return '%s%s' % (month.upper(), str(year)[2:]) - - @staticmethod - def get_course_id(course_code, date): - d = datetime.strptime(date, '%Y-%m-%d') - month, year, period = d.strftime('%b').lower(), d.year, UTMExams.get_period(date) - endings = { - 'dec': { - 'F': '%s9' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'apr': { - 'S': '%s1' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'june': { - 'F': '%s5F' % str(year), - 'Y': '%s5' % str(year) - }, - 'aug': { - 'S': '%s5S' % str(year), - 'Y': '%s5' % str(year) - } - } - - season = course_code[-1] - exam_id = course_id = None - - if month in endings and season in endings[month]: - course_id = '%s%s' % (course_code, endings[month][season]) - exam_id = '%s%s' % (course_id, period) - - return exam_id, course_id - @staticmethod def parse_sections(room): section = '' @@ -168,10 +116,3 @@ def parse_sections(room): room, section = [x.strip() for x in re.sub('[()]', ' ', room).split(' ')] return {'section': section, 'room': room} - - @staticmethod - def parse_time(start, end, date): - def convert_time(t): - h, m, s = [int(x) for x in t.split(':')] - return (h * 60 * 60) + (m * 60) + s - return convert_time(start), convert_time(end) diff --git a/uoftscrapers/scrapers/exams/utsc.py b/uoftscrapers/scrapers/exams/utsc.py index 49cb680..a588e3b 100644 --- a/uoftscrapers/scrapers/exams/utsc.py +++ b/uoftscrapers/scrapers/exams/utsc.py @@ -1,4 +1,5 @@ from ..utils import Scraper +from .exams_helpers import * from bs4 import BeautifulSoup from collections import OrderedDict from datetime import datetime @@ -31,20 +32,22 @@ def scrape(location='.'): course_code, lecture_code = course_code.split(' ') date = data[1] - start, end = UTSCExams.parse_time(data[2], data[3], date) - duration = end - start - - location_ = data[4] - id_, course_id = UTSCExams.get_course_id(course_code, date) + exam_id, course_id = get_course_id(course_code, date) - period = UTSCExams.get_period(date) + period = get_period(date) - if not id_ or not period: + if not exam_id or not period: continue + start = convert_time(data[2]) + end = convert_time(data[3]) + duration = end - start + + location_ = data[4] + doc = OrderedDict([ - ('id', id_), + ('id', exam_id), ('course_id', course_id), ('course_code', course_code), ('campus', 'UTSC'), @@ -56,10 +59,10 @@ def scrape(location='.'): ('sections', []) ]) - if id_ not in exams: - exams[id_] = doc + if exam_id not in exams: + exams[exam_id] = doc - exams[id_]['sections'].append(OrderedDict([ + exams[exam_id]['sections'].append(OrderedDict([ ('lecture_code', lecture_code or ''), ('exam_section', ''), ('location', location_) @@ -69,62 +72,3 @@ def scrape(location='.'): Scraper.save_json(doc, location, id_) Scraper.logger.info('UTSCExams completed.') - - @staticmethod - def get_period(d): - def get_date(month, date, year): - month = 'jun' if month == 'june' else month - return datetime.strptime('%s%s%d' % (year, month, date), '%Y%b%d') - - d = datetime.strptime(d, '%Y-%m-%d') - - year = d.year - month = None - - for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): - if get_date(m, 1, year) <= d <= get_date(m, ld, year): - month = m - break - - if month: - return '%s%s' % (month.upper(), str(year)[2:]) - - @staticmethod - def get_course_id(course_code, date): - d = datetime.strptime(date, '%Y-%m-%d') - month, year, period = d.strftime( - "%b").lower(), d.year, UTSCExams.get_period(date) - endings = { - 'dec': { - 'F': '%s9' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'apr': { - 'S': '%s1' % str(year), - 'Y': '%s9' % str(int(year) - 1) - }, - 'june': { - 'F': '%s5F' % str(year), - 'Y': '%s5' % str(year) - }, - 'aug': { - 'S': '%s5S' % str(year), - 'Y': '%s5' % str(year) - } - } - - season = course_code[-1] - exam_id = course_id = None - - if month in endings and season in endings[month]: - course_id = '%s%s' % (course_code, endings[month][season]) - exam_id = '%s%s' % (course_id, period) - - return exam_id, course_id - - @staticmethod - def parse_time(start, end, date): - def convert_time(t): - h, m = [int(x) for x in t.split(':')] - return (h * 60 * 60) + (m * 60) - return convert_time(start), convert_time(end) diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index 1b0f108..1e24b0a 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -1,4 +1,5 @@ from ..utils import Scraper +from .exams_helpers import * from bs4 import BeautifulSoup from collections import OrderedDict from datetime import datetime @@ -64,9 +65,10 @@ def scrape(location='.', year=None, save=True): for row in rows[1:]: data = [x.text.strip() for x in row.find_all('td')] - id_, course_id, course_code = ArtSciExams.parse_course_info(p, data[0]) + exam_id, course_id, course_code = \ + ArtSciExams.parse_course_info(p, data[0]) - if id_ is None: + if exam_id is None: continue section = data[1] @@ -87,7 +89,7 @@ def scrape(location='.', year=None, save=True): duration = end - start doc = OrderedDict([ - ('id', id_), + ('id', exam_id), ('course_id', course_id), ('course_code', course_code), ('campus', 'UTSG'), @@ -99,10 +101,10 @@ def scrape(location='.', year=None, save=True): ('sections', []) ]) - if id_ not in exams: - exams[id_] = doc + if exam_id not in exams: + exams[exam_id] = doc - exams[id_]['sections'].append(OrderedDict([ + exams[exam_id]['sections'].append(OrderedDict([ ('lecture_code', lecture_section or ''), ('exam_section', exam_section or ''), ('location', location_) @@ -149,12 +151,8 @@ def parse_course_info(period, course_code): @staticmethod def parse_date(date, year): """Convert date of form `D DD MMM` to ISO 8601 format.""" - - date = date.split(' ') - if len(date) == 3: - day, date, month = date - - return datetime.strptime('%s %s %s %s' % (day, date, month, year), + if date.count(' ') == 2: + return datetime.strptime('%s %s' % (date, year), '%a %d %b %y').date().isoformat() @staticmethod @@ -239,7 +237,7 @@ def scrape(location='.', year=None, save=True): duration = 2 * 60 * 60 + 30 * 60 end = start + duration - period = EngExams.get_period(date) + period = get_period(date) exam_id, course_id, course_code = \ EngExams.get_course_info(info.find('strong').text.strip(), period) @@ -292,22 +290,3 @@ def get_course_info(course, period): course_id = '%s20%s%s' % (course_code, year, endings[month]['month']) exam_id = '%s%s' % (course_id, period) return exam_id, course_id, course_code - - @staticmethod - def get_period(d): - def get_date(month, date, year): - month = 'jun' if month == 'june' else month - return datetime.strptime('%s%s%d' % (year, month, date), '%Y%b%d') - - d = datetime.strptime(d, '%Y-%m-%d') - - year = d.year - month = None - - for m, ld in (('dec', 31), ('apr', 30), ('june', 30), ('aug', 31)): - if get_date(m, 1, year) <= d <= get_date(m, ld, year): - month = m - break - - if month: - return '%s%s' % (month.upper(), str(year)[2:])