From 26d00713f13dc85f6366a13f5c2fdebe2e54cfb1 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 11 May 2016 19:23:44 -0400 Subject: [PATCH 01/10] Initialize UTSG Dates --- uoftscrapers/__init__.py | 2 + uoftscrapers/scrapers/dates/__init__.py | 11 +++++ uoftscrapers/scrapers/dates/utsg.py | 55 +++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 uoftscrapers/scrapers/dates/__init__.py create mode 100644 uoftscrapers/scrapers/dates/utsg.py diff --git a/uoftscrapers/__init__.py b/uoftscrapers/__init__.py index 3c67031..854290d 100644 --- a/uoftscrapers/__init__.py +++ b/uoftscrapers/__init__.py @@ -38,6 +38,8 @@ from .scrapers.libraries import Libraries +from .scrapers.dates import Dates + class NullHandler(logging.Handler): def emit(self, record): diff --git a/uoftscrapers/scrapers/dates/__init__.py b/uoftscrapers/scrapers/dates/__init__.py new file mode 100644 index 0000000..f434b7a --- /dev/null +++ b/uoftscrapers/scrapers/dates/__init__.py @@ -0,0 +1,11 @@ +from ..utils import Scraper +from .utsg import UTSGDates + + +class Dates: + + @staticmethod + def scrape(location='.'): + Scraper.logger.info('Dates initialized.') + UTSGDates.scrape(location) + Scraper.logger.info('Dates completed.') diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py new file mode 100644 index 0000000..375f702 --- /dev/null +++ b/uoftscrapers/scrapers/dates/utsg.py @@ -0,0 +1,55 @@ +from ..utils import Scraper +from bs4 import BeautifulSoup +from collections import OrderedDict +from datetime import datetime +from pytz import timezone + + +class UTSGDates: + """A scraper for UTSG important dates. + + Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/. 
+ """ + + @staticmethod + def scrape(location='.'): + Scraper.logger.info('UTSGDates initialized.') + + for faculty in ArtSciDates, EngDates: + dates = faculty.scrape(location) + if dates is not None: + # save json file + pass + + Scraper.logger.info('UTSGDates completed.') + + +class ArtSciDates: + """A scraper for important dates for the Faculty of Arts & Science. + + Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/. + """ + + @staticmethod + def scrape(location='.', year=None): + """Update the local JSON files for this scraper.""" + Scraper.logger.info('ArtSciDates initialized.') + + year = year[2:] or datetime.now().strftime('%y') + + Scraper.logger.info('ArtSciDates completed.') + + +class EngDates: + """A scraper for important dates for UTSG Engineering. + + Data is retrieved from http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm. + """ + + @staticmethod + def scrape(location='.'): + """Update the local JSON files for this scraper.""" + Scraper.logger.info('EngDates initialized.') + + Scraper.logger.info('EngDates completed.') + From 79e118a9431822e295912da209e4d4ffea9f8226 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 11 May 2016 22:21:39 -0400 Subject: [PATCH 02/10] Add support for full schedules i.e. still requires parsing for `tentative` text :frowning: --- uoftscrapers/scrapers/dates/utsg.py | 144 +++++++++++++++++++++++++--- 1 file changed, 129 insertions(+), 15 deletions(-) diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py index 375f702..7b3aeb5 100644 --- a/uoftscrapers/scrapers/dates/utsg.py +++ b/uoftscrapers/scrapers/dates/utsg.py @@ -3,53 +3,167 @@ from collections import OrderedDict from datetime import datetime from pytz import timezone +from pprint import pprint +import re class UTSGDates: - """A scraper for UTSG important dates. - - Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/. 
- """ + """A scraper for UTSG important dates.""" @staticmethod def scrape(location='.'): Scraper.logger.info('UTSGDates initialized.') for faculty in ArtSciDates, EngDates: - dates = faculty.scrape(location) - if dates is not None: - # save json file - pass + docs = faculty.scrape(location, save=False) + if docs is not None: + for date, doc in docs.items(): + Scraper.save_json(doc, location, date) Scraper.logger.info('UTSGDates completed.') class ArtSciDates: - """A scraper for important dates for the Faculty of Arts & Science. + """A scraper for important dates for UTSG Arts & Science. - Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/. + Data is retrieved from + http://www.artsci.utoronto.ca/current/course/timetable/. """ + host = 'http://www.artsci.utoronto.ca/current/course/timetable/' + @staticmethod - def scrape(location='.', year=None): + def scrape(location='.', year=None, save=True): """Update the local JSON files for this scraper.""" Scraper.logger.info('ArtSciDates initialized.') - year = year[2:] or datetime.now().strftime('%y') + for session, endpoint in ArtSciDates.get_sessions(year)[:1]: + headers = { + 'Referer': ArtSciDates.host + } + html = Scraper.get('%s%s' % (ArtSciDates.host, endpoint), + headers=headers, + max_attempts=3) + + if html is None: + Scraper.logger.info('No data available for %s.' 
% session.upper) + continue + + docs = OrderedDict() + + soup = BeautifulSoup(html, 'html.parser') + for tr in soup.find(class_='vertical listing').find_all('tr'): + if tr.find('th'): + continue + + event = tr.find_all('td') + + start_date, end_date = ArtSciDates.parse_dates(event[0].text, session) + + events = [] + for t in event[1].text.split(';\n'): + events += ArtSciDates.normalize_text(t) + + doc = OrderedDict([ + ('start_date', start_date), + ('end_date', end_date), + ('session', session), + ('events', events) + ]) + + if start_date not in docs: + docs[start_date] = doc + else: + docs[start_date]['events'].extend(doc['events']) + + if save: + for date, doc in docs.items(): + Scraper.save_json(doc, location, date) Scraper.logger.info('ArtSciDates completed.') + return docs + + @staticmethod + def normalize_text(text): + text = re.sub(r'\s\s+', ' ', text).strip() + + if text == '': + return [] + + if '\n' in text and text[-2:] != '\n': + return text.split('\n') + + return [text] + + @staticmethod + def get_sessions(year): + try: + date = datetime(year=year) + except: + year = None + + if year is None: + year = datetime.now().strftime('%Y') + + shortened_year = str(year)[2:] + session = '%s%d_fw' % (shortened_year, int(shortened_year) + 1) + + fall = '%s/%s_fall_dates' % (session, str(year)) + winter = '%s/%d_winter_dates' % (session, int(year) + 1) + + summer = '%s5/dates' % year + + return [ + ('FALL%s' % shortened_year, fall), + ('WINTER%s' % shortened_year, winter), + ('SUMMER%s' % shortened_year, summer) + ] + + @staticmethod + def parse_dates(date, session): + def get_date(date_string): + # date_string in the form '%B %d' + month = date_string.split(' ')[0] + year = int(session[-2:]) + if 'FALL' in session and int(datetime.strptime(month, '%B').strftime('%m')) < 4: + year += 1 + + return '%s %d' % (date_string, year) + + start = end = None + if '-' in date: + # Date range + if ' - ' in date: + # e.g. 
December 21 - January 4 + date = date.split(' - ') + + start, end = get_date(date[0]), get_date(date[1]) + else: + # e.g. November 7-8 + month, days = date.split(' ') + days = days.split('-') + + start = get_date('%s %s' % (month, days[0])) + end = get_date('%s %s' % (month, days[1])) + else: + start = end = get_date(date) + + start = datetime.strptime(start, '%B %d %y').date().isoformat() + end = datetime.strptime(end, '%B %d %y').date().isoformat() + + return start, end class EngDates: """A scraper for important dates for UTSG Engineering. - Data is retrieved from http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm. + Data is retrieved from + http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm. """ @staticmethod - def scrape(location='.'): + def scrape(location='.', save=True): """Update the local JSON files for this scraper.""" Scraper.logger.info('EngDates initialized.') Scraper.logger.info('EngDates completed.') - From bdbaf616a462b17cc6aeadeda0e1b4d416c93839 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Fri, 13 May 2016 21:27:06 -0400 Subject: [PATCH 03/10] Only return when not saving --- uoftscrapers/scrapers/athletics/utm.py | 2 +- uoftscrapers/scrapers/athletics/utsc.py | 3 ++- uoftscrapers/scrapers/exams/utsg.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py index 95663c2..0a689a9 100644 --- a/uoftscrapers/scrapers/athletics/utm.py +++ b/uoftscrapers/scrapers/athletics/utm.py @@ -74,4 +74,4 @@ def scrape(location='.', month=None, save=True): Scraper.save_json(doc, location, id_) Scraper.logger.info('UTMAthletics completed.') - return athletics + return athletics if not save else None diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py index 5d6a34b..fd7b07f 100644 --- a/uoftscrapers/scrapers/athletics/utsc.py +++ b/uoftscrapers/scrapers/athletics/utsc.py @@ -72,4 +72,5 
@@ def scrape(location='.', month=None, save=True): Scraper.save_json(doc, location, id_) Scraper.logger.info('UTSCAthletics completed.') - return athletics + + return athletics if not save else None diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index 1e24b0a..668dae5 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -115,7 +115,7 @@ def scrape(location='.', year=None, save=True): Scraper.save_json(doc, location, id_) Scraper.logger.info('ArtSciExams completed.') - return exams + return exams if not save else None @staticmethod def parse_course_info(period, course_code): @@ -272,7 +272,7 @@ def scrape(location='.', year=None, save=True): Scraper.save_json(doc, location, id_) Scraper.logger.info('EngExams completed.') - return exams + return exams if not save else None @staticmethod def get_course_info(course, period): From 0d729f56f0137acf70f544f2a8e9bc359099873b Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Fri, 13 May 2016 21:28:44 -0400 Subject: [PATCH 04/10] Refactor date/session parsers --- uoftscrapers/scrapers/dates/utsg.py | 148 ++++++++++++++++------------ 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py index 7b3aeb5..c9c9e53 100644 --- a/uoftscrapers/scrapers/dates/utsg.py +++ b/uoftscrapers/scrapers/dates/utsg.py @@ -37,121 +37,145 @@ def scrape(location='.', year=None, save=True): """Update the local JSON files for this scraper.""" Scraper.logger.info('ArtSciDates initialized.') - for session, endpoint in ArtSciDates.get_sessions(year)[:1]: - headers = { - 'Referer': ArtSciDates.host - } + docs = OrderedDict() + + for endpoint in ArtSciDates.get_endpoints(year): + headers = {'Referer': ArtSciDates.host} html = Scraper.get('%s%s' % (ArtSciDates.host, endpoint), headers=headers, max_attempts=3) if html is None: - Scraper.logger.info('No data available for %s.' 
% session.upper) + Scraper.logger.info('No data available for %s.' % endpoint.upper) continue - docs = OrderedDict() - soup = BeautifulSoup(html, 'html.parser') + + session = ArtSciDates.parse_session(soup) + for tr in soup.find(class_='vertical listing').find_all('tr'): if tr.find('th'): continue - event = tr.find_all('td') + data = tr.find_all('td') + + start, end = ArtSciDates.parse_dates(data[0].text, session) - start_date, end_date = ArtSciDates.parse_dates(event[0].text, session) + descriptions = [] + for t in data[1].text.split(';\n'): + descriptions += ArtSciDates.normalize_text(t) events = [] - for t in event[1].text.split(';\n'): - events += ArtSciDates.normalize_text(t) + for description in descriptions: + events.append(OrderedDict([ + ('end', end), + ('session', session.upper()), + ('campus', 'UTSG'), + ('description', description) + ])) doc = OrderedDict([ - ('start_date', start_date), - ('end_date', end_date), - ('session', session), - ('events', events) + ('date', start), + ('events', events), ]) - if start_date not in docs: - docs[start_date] = doc + if start not in docs: + docs[start] = doc else: - docs[start_date]['events'].extend(doc['events']) + docs[start]['events'].extend(doc['events']) if save: for date, doc in docs.items(): Scraper.save_json(doc, location, date) Scraper.logger.info('ArtSciDates completed.') - return docs + return docs if not save else None @staticmethod - def normalize_text(text): - text = re.sub(r'\s\s+', ' ', text).strip() - - if text == '': - return [] - - if '\n' in text and text[-2:] != '\n': - return text.split('\n') - - return [text] - - @staticmethod - def get_sessions(year): + def get_endpoints(year): try: date = datetime(year=year) except: year = None if year is None: - year = datetime.now().strftime('%Y') + year = datetime.now().strftime('%y') + + session = '%s%d_fw' % (year, int(year) + 1) + + endpoints = [] - shortened_year = str(year)[2:] - session = '%s%d_fw' % (shortened_year, int(shortened_year) + 1) + 
headers = {'Referer': ArtSciDates.host} + html = Scraper.get('%s%s' % (ArtSciDates.host, session), + headers=headers, + max_attempts=3) - fall = '%s/%s_fall_dates' % (session, str(year)) - winter = '%s/%d_winter_dates' % (session, int(year) + 1) + if html is None: + return endpoints - summer = '%s5/dates' % year + soup = BeautifulSoup(html, 'html.parser') - return [ - ('FALL%s' % shortened_year, fall), - ('WINTER%s' % shortened_year, winter), - ('SUMMER%s' % shortened_year, summer) - ] + for a in soup.find(id='portal-column-one').find_all('a'): + if a.has_attr('title') and 'important dates' in a['title'].lower(): + endpoints.append(a['href']) + + return ['%s/%s' % (session, a.split('/')[-1]) for a in endpoints] +\ + ['20%s5/dates' % year] + + @staticmethod + def parse_session(soup): + session = '' + if soup.find(id='parent-fieldname-title'): + session = soup.find(id='parent-fieldname-title').text + session = session.replace('Important Dates', '').replace(':', '') + else: + # TODO parse page title + pass + return session.strip() @staticmethod def parse_dates(date, session): - def get_date(date_string): - # date_string in the form '%B %d' - month = date_string.split(' ')[0] - year = int(session[-2:]) - if 'FALL' in session and int(datetime.strptime(month, '%B').strftime('%m')) < 4: - year += 1 - return '%s %d' % (date_string, year) + def get_full_date(partial_date): + """Convert a partial date of the form `B d` (e.g. November 8) + to the form `B d Y` (e.g. November 8 2016).""" + month, day = partial_date.split(' ') + year = session[:4] + return '%s %s %s' % (month, day, year) - start = end = None + date = date.replace(' to ', '-').replace('(tentative)', '').strip() if '-' in date: - # Date range + # Date range (e.g. December 21 - January 4 or November 7-8) if ' - ' in date: - # e.g. December 21 - January 4 date = date.split(' - ') - - start, end = get_date(date[0]), get_date(date[1]) + start, end = get_full_date(date[0]), get_full_date(date[1]) else: - # e.g. 
November 7-8 month, days = date.split(' ') days = days.split('-') - start = get_date('%s %s' % (month, days[0])) - end = get_date('%s %s' % (month, days[1])) + start, end = get_full_date('%s %s' % (month, days[0])),\ + get_full_date('%s %s' % (month, days[1])) else: - start = end = get_date(date) + start = end = get_full_date(date) + + return ArtSciDates.convert_date(start), ArtSciDates.convert_date(end) + + @staticmethod + def convert_date(date): + """Convert a date of form `B d Y` (eg. May 13 2016) to ISO-8601.""" + return datetime.strptime(date, '%B %d %Y').date().isoformat() + + @staticmethod + def normalize_text(text): + text = re.sub(r'\s\s+', ' ', text).strip() - start = datetime.strptime(start, '%B %d %y').date().isoformat() - end = datetime.strptime(end, '%B %d %y').date().isoformat() + if text == '': + return [] - return start, end + if '\n' in text and text[-2:] != '\n': + return text.split('\n') + + return [text] class EngDates: From 1ff4e07be2cf208204940c12d778e927481df004 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sat, 14 May 2016 01:59:01 -0400 Subject: [PATCH 05/10] Initialize EngDates scraper --- uoftscrapers/scrapers/dates/utsg.py | 82 ++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 8 deletions(-) diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py index c9c9e53..09b61a7 100644 --- a/uoftscrapers/scrapers/dates/utsg.py +++ b/uoftscrapers/scrapers/dates/utsg.py @@ -4,6 +4,7 @@ from datetime import datetime from pytz import timezone from pprint import pprint +from time import sleep import re @@ -14,11 +15,13 @@ class UTSGDates: def scrape(location='.'): Scraper.logger.info('UTSGDates initialized.') - for faculty in ArtSciDates, EngDates: - docs = faculty.scrape(location, save=False) - if docs is not None: - for date, doc in docs.items(): - Scraper.save_json(doc, location, date) + # for faculty in ArtSciDates, EngDates: + # docs = faculty.scrape(location, save=False) + # if docs is not None: + 
# for date, doc in docs.items(): + # Scraper.save_json(doc, location, date) + + EngDates.scrape(location) Scraper.logger.info('UTSGDates completed.') @@ -119,7 +122,7 @@ def get_endpoints(year): if a.has_attr('title') and 'important dates' in a['title'].lower(): endpoints.append(a['href']) - return ['%s/%s' % (session, a.split('/')[-1]) for a in endpoints] +\ + return ['%s/%s' % (session, a.split('/')[-1]) for a in endpoints] + \ ['20%s5/dates' % year] @staticmethod @@ -153,7 +156,7 @@ def get_full_date(partial_date): month, days = date.split(' ') days = days.split('-') - start, end = get_full_date('%s %s' % (month, days[0])),\ + start, end = get_full_date('%s %s' % (month, days[0])), \ get_full_date('%s %s' % (month, days[1])) else: start = end = get_full_date(date) @@ -185,9 +188,72 @@ class EngDates: http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm. """ + host = 'http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm' + + FORM_DATA = { + 'viewstate': '__VIEWSTATE', + 'viewstate_generator': '__VIEWSTATEGENERATOR', + 'numerical_date': 'ctl02$ctlSelectedDate$hdnDateValueForQuestionnaireResponses', + 'textual_date': 'ctl02$ctlSelectedDate$txtDate' + } + @staticmethod - def scrape(location='.', save=True): + def scrape(location='.', year=None, save=True): """Update the local JSON files for this scraper.""" Scraper.logger.info('EngDates initialized.') + year = year or datetime.now().year + + viewstate, viewstate_generator, numerical_date, textual_date = \ + EngDates.FORM_DATA.values() + + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': EngDates.host, + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36' + } + + s = Scraper.s + s.headers.update(headers) + + 
payload = {} + payload[viewstate], payload[viewstate_generator] = \ + EngDates.get_viewstate(s) + + for i in range(1, 13): + month = datetime.strptime(str(i), '%m').strftime('%B') + payload[textual_date] = '%s 15 %s' % (month, year) + + payload[numerical_date] = '%s.%s.15' % (year, str(i).zfill(2)) + + attempts = 0 + + html = s.post(EngDates.host, data=payload).text.encode('utf-8') or '' + soup = BeautifulSoup(html, 'html.parser') + + while attempts < 5 and soup.find('div', class_='error'): + print('attempt %d' % attempts) + + html = s.post(EngDates.host, data=payload).text.encode('utf-8') or '' + soup = BeautifulSoup(html, 'html.parser') + + print(soup.find('div', class_='error')) + + attempts += 1 + sleep(1) + + if not html or soup.find('div', class_='error'): + continue + Scraper.logger.info('EngDates completed.') + + @staticmethod + def get_viewstate(s): + html = s.get(EngDates.host) + soup = BeautifulSoup(html.content, 'html.parser') + + return soup.find(id='__VIEWSTATE')['value'],\ + soup.find(id='__VIEWSTATEGENERATOR')['value'] From f74dcf724a0cc48ad7ca3758d4e9483ddba90822 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 15 May 2016 20:17:52 -0400 Subject: [PATCH 06/10] Move UTMDates scraper --- uoftscrapers/scrapers/calendar/utm.py | 70 ++---------------------- uoftscrapers/scrapers/dates/utm.py | 79 +++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 66 deletions(-) create mode 100644 uoftscrapers/scrapers/dates/utm.py diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index b90b4aa..d5bd387 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -4,76 +4,14 @@ import json import os import requests -import datetime class UTMCalendar: - '''Scraper for Important dates from UTM calendar found at https://www.utm.utoronto.ca/registrar/important-dates - ''' - link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header=' - sessionNumber = [5, 
9] - @staticmethod - def scrape(location='.', year=None): #scrapes most current sessions by default - - year = year or datetime.datetime.now().year + host = 'http://www.artsandscience.utoronto.ca/ofr/calendar/' - currentSession = "{0} SUMMER" - calendar = OrderedDict() + @staticmethod + def scrape(location='.'): Scraper.logger.info('UTMCalendar initialized.') - for session in UTMCalendar.sessionNumber: - html = Scraper.get(UTMCalendar.link.format(year, session)) - soup = BeautifulSoup(html, 'html.parser') - content = soup.find('div', class_='content') - dates = content.find_all('div', class_='title') - i = 0 - currentDate = dates[i] - while(i1: - eventEnd = UTMCalendar.convert_date(eventStartEnd[1].strip()) - else: - eventEnd = eventStart - - events.append(OrderedDict([ - ('end_date', eventEnd), - ('session', currentSession.format(UTMCalendar.get_year_from(eventStart))), - ('campus', 'UTM'), - ('description', description) - ])) - i+=1 - if(i>=len(dates)): - break; - calendar[date] = OrderedDict([ - ('date', eventStart), - ('events', events) - ]) - if(i1: + eventEnd = UTMDates.convert_date(eventStartEnd[1].strip()) + else: + eventEnd = eventStart + + events.append(OrderedDict([ + ('end_date', eventEnd), + ('session', currentSession.format(UTMDates.get_year_from(eventStart))), + ('campus', 'UTM'), + ('description', description) + ])) + i+=1 + if(i>=len(dates)): + break; + calendar[date] = OrderedDict([ + ('date', eventStart), + ('events', events) + ]) + if(i Date: Sun, 15 May 2016 20:40:22 -0400 Subject: [PATCH 07/10] PEP 8 --- uoftscrapers/scrapers/dates/utm.py | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/uoftscrapers/scrapers/dates/utm.py b/uoftscrapers/scrapers/dates/utm.py index b3d960d..2b158b7 100644 --- a/uoftscrapers/scrapers/dates/utm.py +++ b/uoftscrapers/scrapers/dates/utm.py @@ -14,7 +14,7 @@ class UTMDates: link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header=' 
sessionNumber = [5, 9] @staticmethod - def scrape(location='.', year=None): #scrapes most current sessions by default + def scrape(location='.', year=None, save=True): # scrapes most current sessions by default year = year or datetime.datetime.now().year @@ -28,15 +28,15 @@ def scrape(location='.', year=None): #scrapes most current sessions by default dates = content.find_all('div', class_='title') i = 0 currentDate = dates[i] - while(i1: + if len(eventStartEnd) > 1: eventEnd = UTMDates.convert_date(eventStartEnd[1].strip()) else: eventEnd = eventStart @@ -47,23 +47,23 @@ def scrape(location='.', year=None): #scrapes most current sessions by default ('campus', 'UTM'), ('description', description) ])) - i+=1 - if(i>=len(dates)): - break; - calendar[date] = OrderedDict([ - ('date', eventStart), - ('events', events) - ]) - if(i= len(dates)): + break + calendar[eventStart] = OrderedDict([ + ('date', eventStart), + ('events', events) + ]) + if(i < len(dates)): currentDate = dates[i] currentSession = "{0} FALL/WINTER" - - for date, info in calendar.items(): - Scraper.save_json(info, location, UTMDates.convert_date(date)) + if save: + for date, info in calendar.items(): + Scraper.save_json(info, location, UTMDates.convert_date(date)) Scraper.logger.info('UTMDates completed.') - return calendar + return calendar if not save else None @staticmethod def convert_date(date): From fa4dfcbada87bae5085a6f2226693924283d8883 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 15 May 2016 20:43:01 -0400 Subject: [PATCH 08/10] Add EngDates scraper --- uoftscrapers/scrapers/dates/utsg.py | 102 ++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 20 deletions(-) diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py index 09b61a7..f0734e7 100644 --- a/uoftscrapers/scrapers/dates/utsg.py +++ b/uoftscrapers/scrapers/dates/utsg.py @@ -3,27 +3,41 @@ from collections import OrderedDict from datetime import datetime from pytz import timezone -from 
pprint import pprint from time import sleep import re +import http.cookiejar class UTSGDates: """A scraper for UTSG important dates.""" @staticmethod - def scrape(location='.'): + def scrape(location='.', year=None, save=True): Scraper.logger.info('UTSGDates initialized.') - # for faculty in ArtSciDates, EngDates: - # docs = faculty.scrape(location, save=False) - # if docs is not None: - # for date, doc in docs.items(): - # Scraper.save_json(doc, location, date) + docs = OrderedDict() + + for faculty in ArtSciDates, EngDates: + dates = faculty.scrape(location, year=year, save=False) + + if dates is None: + continue + + for date, doc in dates.items(): + if date not in docs: + docs[date] = OrderedDict([ + ('date', date), + ('events', []) + ]) - EngDates.scrape(location) + docs[date]['events'].extend(doc['events']) + + if save: + for date, doc in docs.items(): + Scraper.save_json(doc, location, date) Scraper.logger.info('UTSGDates completed.') + return docs if not save else None class ArtSciDates: @@ -190,7 +204,9 @@ class EngDates: host = 'http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm' - FORM_DATA = { + cookies = http.cookiejar.CookieJar() + + form_data = { 'viewstate': '__VIEWSTATE', 'viewstate_generator': '__VIEWSTATEGENERATOR', 'numerical_date': 'ctl02$ctlSelectedDate$hdnDateValueForQuestionnaireResponses', @@ -205,7 +221,7 @@ def scrape(location='.', year=None, save=True): year = year or datetime.now().year viewstate, viewstate_generator, numerical_date, textual_date = \ - EngDates.FORM_DATA.values() + EngDates.form_data.values() headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', @@ -223,36 +239,82 @@ def scrape(location='.', year=None, save=True): payload[viewstate], payload[viewstate_generator] = \ EngDates.get_viewstate(s) - for i in range(1, 13): - month = datetime.strptime(str(i), '%m').strftime('%B') - payload[textual_date] = '%s 15 %s' % (month, year) + docs = OrderedDict() + + for m 
in range(1, 13): + Scraper.logger.info('Scraping month %d' % m) - payload[numerical_date] = '%s.%s.15' % (year, str(i).zfill(2)) + month = datetime.strptime(str(m), '%m').strftime('%B') + payload[textual_date] = '%s 15 %s' % (month, year) + payload[numerical_date] = '%s.%s.15' % (year, str(m).zfill(2)) attempts = 0 - html = s.post(EngDates.host, data=payload).text.encode('utf-8') or '' + html = s.post(EngDates.host, + data=payload, + cookies=EngDates.cookies).text.encode('utf-8') or '' soup = BeautifulSoup(html, 'html.parser') while attempts < 5 and soup.find('div', class_='error'): - print('attempt %d' % attempts) + Scraper.logger.info('Attempt #%d' % (attempts + 1)) - html = s.post(EngDates.host, data=payload).text.encode('utf-8') or '' + html = s.post(EngDates.host, + data=payload, + cookies=EngDates.cookies).text.encode('utf-8') or '' soup = BeautifulSoup(html, 'html.parser') - print(soup.find('div', class_='error')) - attempts += 1 sleep(1) if not html or soup.find('div', class_='error'): + Scraper.logger.info('Couldn\'t scrape month %d' % m) continue + for tr in soup.find(id='ctl02_ctlCalendar').find_all('tr')[2:]: + for td in tr.find_all('td'): + if not td.find('a') or not td.find('div').find('a'): + continue + + start = end = EngDates.get_date(td.find('a')['title'], year) + + session = '%d ENGINEERING' % year + + events = [] + for div in td.find_all('div'): + event = div.find('a') + + events.append(OrderedDict([ + ('end_date', end), + ('session', session), + ('campus', 'UTSG'), + ('description', event.text) + ])) + + if start not in docs: + docs[start] = OrderedDict([ + ('date', start), + ('events', events) + ]) + else: + docs[start]['events'].extend(events) + + if save: + for date, doc in docs.items(): + Scraper.save_json(doc, location, date) + Scraper.logger.info('EngDates completed.') + return docs if not save else None + + @staticmethod + def get_date(date, year): + """Return a IS0 8601 date from a date string of the form `M d`""" + date = '%s %s' % 
(year, date) + return datetime.strptime(date, '%Y %B %d').date().isoformat() @staticmethod def get_viewstate(s): - html = s.get(EngDates.host) + headers = {'Referer': EngDates.host} + html = s.get(EngDates.host, headers=headers) soup = BeautifulSoup(html.content, 'html.parser') return soup.find(id='__VIEWSTATE')['value'],\ From e5834bd7875d1acbf6fe6d44b1a9d5080cedb5c1 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 15 May 2016 20:43:14 -0400 Subject: [PATCH 09/10] Merge UTM/UTSG docs --- uoftscrapers/scrapers/dates/__init__.py | 26 +++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/uoftscrapers/scrapers/dates/__init__.py b/uoftscrapers/scrapers/dates/__init__.py index f434b7a..9432e1a 100644 --- a/uoftscrapers/scrapers/dates/__init__.py +++ b/uoftscrapers/scrapers/dates/__init__.py @@ -1,11 +1,33 @@ from ..utils import Scraper from .utsg import UTSGDates +from .utm import UTMDates + +from collections import OrderedDict class Dates: @staticmethod - def scrape(location='.'): + def scrape(location='.', year=None): Scraper.logger.info('Dates initialized.') - UTSGDates.scrape(location) + + docs = OrderedDict() + + for campus in UTSGDates, UTMDates: + dates = campus.scrape(location, year=year, save=False) + + if dates is None: + continue + + for date, doc in dates.items(): + if date not in docs: + docs[date] = OrderedDict([ + ('date', date), + ('events', []) + ]) + docs[date]['events'].extend(doc['events']) + + for date, doc in docs.items(): + Scraper.save_json(doc, location, date) + Scraper.logger.info('Dates completed.') From 2a432e20db7ed1abf93314c65a3395d5614be57d Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Sun, 15 May 2016 20:43:22 -0400 Subject: [PATCH 10/10] Add Dates reference --- README.md | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/README.md b/README.md index a8621eb..08ddf29 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,9 @@ This is a library 
of scrapers for various University of Toronto websites. It is - [Shuttle Bus Schedule](#shuttles) - [Events](#events) - [Libraries](#libraries) + - [Dates](#dates) + - [UTSG Dates](#utsg-dates) + - [UTM Dates](#utm-dates) ## Requirements - [python3](https://www.python.org/download/releases/3.5.1) @@ -692,3 +695,62 @@ https://onesearch.library.utoronto.ca/ } } ``` + +-------------------------------------------------------------------------------- + +### Dates + +##### Class name +```python +uoftscrapers.Dates +``` + +##### Scraper source + - [UTSG Dates](#utsg-dates) + - [UTM Dates](#utm-dates) + +##### Output format +```js +{ + "date": String, + "events": [{ + "end_date": String, + "session": String, + "campus": String, + "description": String + }] +} +``` + +---------------------------------------- + +### UTSG Dates + +##### Class name +```python +uoftscrapers.UTSGDates +``` + +##### Scraper source +http://www.artsci.utoronto.ca/current/course/timetable/ +http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm + +##### Output format +Refer to [Dates](#dates) + +-------------------- + +### UTM Dates + +##### Class name +```python +uoftscrapers.UTMDates +``` + +##### Scraper source +http://m.utm.utoronto.ca/importantDates.php + +##### Output format +Refer to [Dates](#dates) + +--------------------------------------------------------------------------------