From 13fdda031e17d18fe80d379ea35eea3ac995fa96 Mon Sep 17 00:00:00 2001 From: Anderson Ng Ho Yin Date: Thu, 12 May 2016 15:48:26 -0400 Subject: [PATCH 1/9] initial commit added initial scraping code --- uoftscrapers/scrapers/calendar/utm.py | 33 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index d5bd387..752cff7 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -4,14 +4,39 @@ import json import os import requests +import datetime +now = datetime.datetime.now() class UTMCalendar: - host = 'http://www.artsandscience.utoronto.ca/ofr/calendar/' + host1 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}5&header=' + host2 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}9&header=' @staticmethod - def scrape(location='.'): + def scrape(location='.', year=None): + year = year or now.year + Scraper.logger.info('UTMCalendar initialized.') - Scraper.logger.info('Not implemented.') - Scraper.logger.info('UTMCalendar completed.') + + html = Scraper.get(UTMCalendar.host1.format(year)) + soup = BeautifulSoup(html, 'html.parser') + content = soup.find('div', class_='content') + dates = content.find_all('div', class_='title') + i = 0 + currentDate = dates[i] + while(i=len(dates)): + break; + if(i Date: Thu, 12 May 2016 15:48:47 -0400 Subject: [PATCH 2/9] Revert "initial commit" This reverts commit 13fdda031e17d18fe80d379ea35eea3ac995fa96. --- uoftscrapers/scrapers/calendar/utm.py | 33 ++++----------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index 752cff7..d5bd387 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -4,39 +4,14 @@ import json import os import requests -import datetime -now = datetime.datetime.now() class UTMCalendar: - host1 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}5&header=' - host2 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}9&header=' + host = 'http://www.artsandscience.utoronto.ca/ofr/calendar/' @staticmethod - def scrape(location='.', year=None): - year = year or now.year - + def scrape(location='.'): Scraper.logger.info('UTMCalendar initialized.') - - html = Scraper.get(UTMCalendar.host1.format(year)) - soup = BeautifulSoup(html, 'html.parser') - content = soup.find('div', class_='content') - dates = content.find_all('div', class_='title') - i = 0 - currentDate = dates[i] - while(i=len(dates)): - break; - if(i Date: Thu, 12 May 2016 16:22:01 -0400 Subject: [PATCH 3/9] initial commit Added basic scraping code --- uoftscrapers/scrapers/calendar/utm.py | 33 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index d5bd387..752cff7 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -4,14 +4,39 @@ import json import os import requests +import datetime +now = datetime.datetime.now() class UTMCalendar: - host = 'http://www.artsandscience.utoronto.ca/ofr/calendar/' + host1 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}5&header=' + host2 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}9&header=' @staticmethod - def scrape(location='.'): + def scrape(location='.', year=None): + year = year or now.year + Scraper.logger.info('UTMCalendar initialized.') - Scraper.logger.info('Not implemented.') - Scraper.logger.info('UTMCalendar completed.') + + html = Scraper.get(UTMCalendar.host1.format(year)) + soup = BeautifulSoup(html, 'html.parser') + content = soup.find('div', class_='content') + dates = content.find_all('div', class_='title') + i = 0 + currentDate = dates[i] + while(i=len(dates)): + break; + if(i Date: Thu, 12 May 2016 17:41:41 -0400 Subject: [PATCH 4/9] Produced a functional scraper the scraper and now functional, but the JSON file names may have to be changed --- uoftscrapers/scrapers/calendar/utm.py | 58 ++++++++++++++++++--------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index 752cff7..4b8d300 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -12,31 +12,53 @@ class UTMCalendar: host1 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}5&header=' host2 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}9&header=' + sessionLinks = [host1, host2] @staticmethod def scrape(location='.', year=None): year = year or now.year + calendar = OrderedDict() Scraper.logger.info('UTMCalendar initialized.') + for link in UTMCalendar.sessionLinks: + html = Scraper.get(link.format(year)) + soup = BeautifulSoup(html, 'html.parser') + content = soup.find('div', class_='content') + dates = content.find_all('div', class_='title') + i = 0 + currentDate = dates[i] + while(i1: + eventEnd = eventStartEnd[1].strip() + else: + eventEnd = eventStart - html = Scraper.get(UTMCalendar.host1.format(year)) - soup = BeautifulSoup(html, 'html.parser') - content = soup.find('div', class_='content') - dates = content.find_all('div', class_='title') - i = 0 - currentDate = dates[i] - while(i=len(dates)): + break; + calendar[date] = OrderedDict([ + ('date', eventStart), + ('session', "Summer"), + ('events', events) + ]) + if(i=len(dates)): - break; - if(i Date: Thu, 12 May 2016 17:52:25 -0400 Subject: [PATCH 5/9] fixed important dates session bug fixed bug and added more comments --- uoftscrapers/scrapers/calendar/utm.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index 4b8d300..1330446 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -9,13 +9,16 @@ class UTMCalendar: + '''Scraper for Important dates from UTM calendar found at https://www.utm.utoronto.ca/registrar/important-dates + ''' - host1 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}5&header=' - host2 = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}9&header=' - sessionLinks = [host1, host2] - + summerLink = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}5&header=' + fallLink = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}9&header=' + sessionLinks = [summerLink, fallLink] + currentSession = "Summer" @staticmethod - def scrape(location='.', year=None): + def scrape(location='.', year=None): #scrapes most current sessions by default + year = year or now.year calendar = OrderedDict() @@ -33,7 +36,7 @@ def scrape(location='.', year=None): while (currentDate == dates[i]): info = dates[i].find_next('div', class_='info') description = info.text - eventStartEnd = date.split('-') + eventStartEnd = date.split('-') #splits event dates over a period eventStart = eventStartEnd[0].strip() if len(eventStartEnd)>1: eventEnd = eventStartEnd[1].strip() @@ -50,12 +53,12 @@ def scrape(location='.', year=None): break; calendar[date] = OrderedDict([ ('date', eventStart), - ('session', "Summer"), + ('session', UTMCalendar.currentSession), ('events', events) ]) if(i Date: Thu, 12 May 2016 19:20:52 -0400 Subject: [PATCH 6/9] Updated JSON file names Update date format to match ISO 8601 format --- uoftscrapers/scrapers/calendar/utm.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index 1330446..7ef385d 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -60,8 +60,19 @@ def scrape(location='.', year=None): #scrapes most current sessions by default currentDate = dates[i] UTMCalendar.currentSession = "Fall/Winter" + for date, info in calendar.items(): - Scraper.save_json(info, location, date) + Scraper.save_json(info, location, UTMCalendar.convert_date(date)) Scraper.logger.info('UTMCalendar completed.') - return calendar \ No newline at end of file + return calendar + + @staticmethod + def convert_date(date): + date_dict = {'January':'1', 'February':'2', 'March':'3', 'April':'4', 'May':'5', 'June':'6', 'July':'7', + 'August':'8', 'September':'9', 'October':'10', 'November':'11', 'December':'12'} + splitDate = date.split(' ') + year = splitDate[2] + day = splitDate[1].strip(',') + month = date_dict[splitDate[0]] + return("{0}-{1}-{2}".format(year, month, day)) \ No newline at end of file From d2d20825b46f9c39db88005b5557866ec9cecf77 Mon Sep 17 00:00:00 2001 From: Anderson Ng Ho Yin Date: Fri, 13 May 2016 14:35:00 -0400 Subject: [PATCH 7/9] Cleaned up scraper code Removed unnecessary lines and implemented suggested fixes --- uoftscrapers/scrapers/calendar/utm.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index 7ef385d..92b57dd 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -5,26 +5,24 @@ import os import requests import datetime -now = datetime.datetime.now() class UTMCalendar: '''Scraper for Important dates from UTM calendar found at https://www.utm.utoronto.ca/registrar/important-dates ''' - summerLink = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}5&header=' - fallLink = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}9&header=' - sessionLinks = [summerLink, fallLink] + link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header=' + sessionNumber = [5, 9] currentSession = "Summer" @staticmethod def scrape(location='.', year=None): #scrapes most current sessions by default - year = year or now.year + year = year or datetime.datetime.now() calendar = OrderedDict() Scraper.logger.info('UTMCalendar initialized.') - for link in UTMCalendar.sessionLinks: - html = Scraper.get(link.format(year)) + for session in UTMCalendar.sessionNumber: + html = Scraper.get(UTMCalendar.link.format(year, session)) soup = BeautifulSoup(html, 'html.parser') content = soup.find('div', class_='content') dates = content.find_all('div', class_='title') @@ -37,9 +35,9 @@ def scrape(location='.', year=None): #scrapes most current sessions by default info = dates[i].find_next('div', class_='info') description = info.text eventStartEnd = date.split('-') #splits event dates over a period - eventStart = eventStartEnd[0].strip() + eventStart = UTMCalendar.convert_date(eventStartEnd[0].strip()) if len(eventStartEnd)>1: - eventEnd = eventStartEnd[1].strip() + eventEnd = UTMCalendar.convert_date(eventStartEnd[1].strip()) else: eventEnd = eventStart @@ -69,10 +67,9 @@ def scrape(location='.', year=None): #scrapes most current sessions by default @staticmethod def convert_date(date): - date_dict = {'January':'1', 'February':'2', 'March':'3', 'April':'4', 'May':'5', 'June':'6', 'July':'7', - 'August':'8', 'September':'9', 'October':'10', 'November':'11', 'December':'12'} splitDate = date.split(' ') + print(splitDate) year = splitDate[2] day = splitDate[1].strip(',') - month = date_dict[splitDate[0]] - return("{0}-{1}-{2}".format(year, month, day)) \ No newline at end of file + month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m') + return("{0}-{1}-{2}".format(year, month, day.zfill(2))) \ No newline at end of file From c006dbe728e12b0e9ee86cbfd996cec3cf00fbd3 Mon Sep 17 00:00:00 2001 From: Anderson Ng Ho Yin Date: Fri, 13 May 2016 17:48:42 -0400 Subject: [PATCH 8/9] Fixed minor issues --- uoftscrapers/scrapers/calendar/utm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index 92b57dd..71e8324 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -17,7 +17,7 @@ class UTMCalendar: @staticmethod def scrape(location='.', year=None): #scrapes most current sessions by default - year = year or datetime.datetime.now() + year = year or datetime.datetime.now().year calendar = OrderedDict() Scraper.logger.info('UTMCalendar initialized.') @@ -68,7 +68,6 @@ def scrape(location='.', year=None): #scrapes most current sessions by default @staticmethod def convert_date(date): splitDate = date.split(' ') - print(splitDate) year = splitDate[2] day = splitDate[1].strip(',') month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m') From 5bafb09aa5a9f01e8c1c93ae657bc2fe74a5be79 Mon Sep 17 00:00:00 2001 From: Anderson Ng Ho Yin Date: Sat, 14 May 2016 16:57:07 -0400 Subject: [PATCH 9/9] Updated JSON file format --- uoftscrapers/scrapers/calendar/utm.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py index 71e8324..b90b4aa 100644 --- a/uoftscrapers/scrapers/calendar/utm.py +++ b/uoftscrapers/scrapers/calendar/utm.py @@ -13,12 +13,12 @@ class UTMCalendar: link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header=' sessionNumber = [5, 9] - currentSession = "Summer" @staticmethod def scrape(location='.', year=None): #scrapes most current sessions by default year = year or datetime.datetime.now().year + currentSession = "{0} SUMMER" calendar = OrderedDict() Scraper.logger.info('UTMCalendar initialized.') for session in UTMCalendar.sessionNumber: @@ -43,6 +43,7 @@ def scrape(location='.', year=None): #scrapes most current sessions by default events.append(OrderedDict([ ('end_date', eventEnd), + ('session', currentSession.format(UTMCalendar.get_year_from(eventStart))), ('campus', 'UTM'), ('description', description) ])) @@ -51,12 +52,11 @@ def scrape(location='.', year=None): #scrapes most current sessions by default break; calendar[date] = OrderedDict([ ('date', eventStart), - ('session', UTMCalendar.currentSession), ('events', events) ]) if(i