From 26d00713f13dc85f6366a13f5c2fdebe2e54cfb1 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Wed, 11 May 2016 19:23:44 -0400
Subject: [PATCH 01/10] Initialize UTSG Dates

---
 uoftscrapers/__init__.py                |  2 +
 uoftscrapers/scrapers/dates/__init__.py | 11 +++++
 uoftscrapers/scrapers/dates/utsg.py     | 55 +++++++++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 uoftscrapers/scrapers/dates/__init__.py
 create mode 100644 uoftscrapers/scrapers/dates/utsg.py

diff --git a/uoftscrapers/__init__.py b/uoftscrapers/__init__.py
index 3c67031..854290d 100644
--- a/uoftscrapers/__init__.py
+++ b/uoftscrapers/__init__.py
@@ -38,6 +38,8 @@
 
 from .scrapers.libraries import Libraries
 
+from .scrapers.dates import Dates
+
 class NullHandler(logging.Handler):
 
     def emit(self, record):
diff --git a/uoftscrapers/scrapers/dates/__init__.py b/uoftscrapers/scrapers/dates/__init__.py
new file mode 100644
index 0000000..f434b7a
--- /dev/null
+++ b/uoftscrapers/scrapers/dates/__init__.py
@@ -0,0 +1,11 @@
+from ..utils import Scraper
+from .utsg import UTSGDates
+
+
+class Dates:
+
+    @staticmethod
+    def scrape(location='.'):
+        Scraper.logger.info('Dates initialized.')
+        UTSGDates.scrape(location)
+        Scraper.logger.info('Dates completed.')
diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py
new file mode 100644
index 0000000..375f702
--- /dev/null
+++ b/uoftscrapers/scrapers/dates/utsg.py
@@ -0,0 +1,55 @@
+from ..utils import Scraper
+from bs4 import BeautifulSoup
+from collections import OrderedDict
+from datetime import datetime
+from pytz import timezone
+
+
+class UTSGDates:
+    """A scraper for UTSG important dates.
+
+    Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/.
+    """
+
+    @staticmethod
+    def scrape(location='.'):
+        Scraper.logger.info('UTSGDates initialized.')
+
+        for faculty in ArtSciDates, EngDates:
+            dates = faculty.scrape(location)
+            if dates is not None:
+                # save json file
+                pass
+
+        Scraper.logger.info('UTSGDates completed.')
+
+
+class ArtSciDates:
+    """A scraper for important dates for the Faculty of Arts & Science.
+
+    Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/.
+    """
+
+    @staticmethod
+    def scrape(location='.', year=None):
+        """Update the local JSON files for this scraper."""
+        Scraper.logger.info('ArtSciDates initialized.')
+
+        year = year[2:] or datetime.now().strftime('%y')
+
+        Scraper.logger.info('ArtSciDates completed.')
+
+
+class EngDates:
+    """A scraper for important dates for UTSG Engineering.
+
+    Data is retrieved from http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm.
+    """
+
+    @staticmethod
+    def scrape(location='.'):
+        """Update the local JSON files for this scraper."""
+        Scraper.logger.info('EngDates initialized.')
+
+        Scraper.logger.info('EngDates completed.')
+

From 79e118a9431822e295912da209e4d4ffea9f8226 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Wed, 11 May 2016 22:21:39 -0400
Subject: [PATCH 02/10] Add support for full schedules

Note: dates marked with `tentative` text still need to be parsed :frowning:
---
 uoftscrapers/scrapers/dates/utsg.py | 144 +++++++++++++++++++++++++---
 1 file changed, 129 insertions(+), 15 deletions(-)

diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py
index 375f702..7b3aeb5 100644
--- a/uoftscrapers/scrapers/dates/utsg.py
+++ b/uoftscrapers/scrapers/dates/utsg.py
@@ -3,53 +3,167 @@
 from collections import OrderedDict
 from datetime import datetime
 from pytz import timezone
+from pprint import pprint
+import re
 
 
 class UTSGDates:
-    """A scraper for UTSG important dates.
-
-    Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/.
-    """
+    """A scraper for UTSG important dates."""
 
     @staticmethod
     def scrape(location='.'):
         Scraper.logger.info('UTSGDates initialized.')
 
         for faculty in ArtSciDates, EngDates:
-            dates = faculty.scrape(location)
-            if dates is not None:
-                # save json file
-                pass
+            docs = faculty.scrape(location, save=False)
+            if docs is not None:
+                for date, doc in docs.items():
+                    Scraper.save_json(doc, location, date)
 
         Scraper.logger.info('UTSGDates completed.')
 
 
 class ArtSciDates:
-    """A scraper for important dates for the Faculty of Arts & Science.
+    """A scraper for important dates for UTSG Arts & Science.
 
-    Data is retrieved from http://www.artsci.utoronto.ca/current/course/timetable/.
+    Data is retrieved from
+    http://www.artsci.utoronto.ca/current/course/timetable/.
     """
 
+    host = 'http://www.artsci.utoronto.ca/current/course/timetable/'
+
     @staticmethod
-    def scrape(location='.', year=None):
+    def scrape(location='.', year=None, save=True):
         """Update the local JSON files for this scraper."""
         Scraper.logger.info('ArtSciDates initialized.')
 
-        year = year[2:] or datetime.now().strftime('%y')
+        for session, endpoint in ArtSciDates.get_sessions(year)[:1]:
+            headers = {
+                'Referer': ArtSciDates.host
+            }
+            html = Scraper.get('%s%s' % (ArtSciDates.host, endpoint),
+                               headers=headers,
+                               max_attempts=3)
+
+            if html is None:
+                Scraper.logger.info('No data available for %s.' % session.upper)
+                continue
+
+            docs = OrderedDict()
+
+            soup = BeautifulSoup(html, 'html.parser')
+            for tr in soup.find(class_='vertical listing').find_all('tr'):
+                if tr.find('th'):
+                    continue
+
+                event = tr.find_all('td')
+
+                start_date, end_date = ArtSciDates.parse_dates(event[0].text, session)
+
+                events = []
+                for t in event[1].text.split(';\n'):
+                    events += ArtSciDates.normalize_text(t)
+
+                doc = OrderedDict([
+                    ('start_date', start_date),
+                    ('end_date', end_date),
+                    ('session', session),
+                    ('events', events)
+                ])
+
+                if start_date not in docs:
+                    docs[start_date] = doc
+                else:
+                    docs[start_date]['events'].extend(doc['events'])
+
+        if save:
+            for date, doc in docs.items():
+                Scraper.save_json(doc, location, date)
 
         Scraper.logger.info('ArtSciDates completed.')
+        return docs
+
+    @staticmethod
+    def normalize_text(text):
+        text = re.sub(r'\s\s+', ' ', text).strip()
+
+        if text == '':
+            return []
+
+        if '\n' in text and text[-2:] != '\n':
+            return text.split('\n')
+
+        return [text]
+
+    @staticmethod
+    def get_sessions(year):
+        try:
+            date = datetime(year=year)
+        except:
+            year = None
+
+        if year is None:
+            year = datetime.now().strftime('%Y')
+
+        shortened_year = str(year)[2:]
+        session = '%s%d_fw' % (shortened_year, int(shortened_year) + 1)
+
+        fall = '%s/%s_fall_dates' % (session, str(year))
+        winter = '%s/%d_winter_dates' % (session, int(year) + 1)
+
+        summer = '%s5/dates' % year
+
+        return [
+            ('FALL%s' % shortened_year, fall),
+            ('WINTER%s' % shortened_year, winter),
+            ('SUMMER%s' % shortened_year, summer)
+        ]
+
+    @staticmethod
+    def parse_dates(date, session):
+        def get_date(date_string):
+            # date_string in the form '%B %d'
+            month = date_string.split(' ')[0]
+            year = int(session[-2:])
+            if 'FALL' in session and int(datetime.strptime(month, '%B').strftime('%m')) < 4:
+                year += 1
+
+            return '%s %d' % (date_string, year)
+
+        start = end = None
+        if '-' in date:
+            # Date range
+            if ' - ' in date:
+                # e.g. December 21 - January 4
+                date = date.split(' - ')
+
+                start, end = get_date(date[0]), get_date(date[1])
+            else:
+                # e.g. November 7-8
+                month, days = date.split(' ')
+                days = days.split('-')
+
+                start = get_date('%s %s' % (month, days[0]))
+                end = get_date('%s %s' % (month, days[1]))
+        else:
+            start = end = get_date(date)
+
+        start = datetime.strptime(start, '%B %d %y').date().isoformat()
+        end = datetime.strptime(end, '%B %d %y').date().isoformat()
+
+        return start, end
 
 
 class EngDates:
     """A scraper for important dates for UTSG Engineering.
 
-    Data is retrieved from http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm.
+    Data is retrieved from
+    http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm.
     """
 
     @staticmethod
-    def scrape(location='.'):
+    def scrape(location='.', save=True):
         """Update the local JSON files for this scraper."""
         Scraper.logger.info('EngDates initialized.')
 
         Scraper.logger.info('EngDates completed.')
-

From bdbaf616a462b17cc6aeadeda0e1b4d416c93839 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Fri, 13 May 2016 21:27:06 -0400
Subject: [PATCH 03/10] Only return when not saving

---
 uoftscrapers/scrapers/athletics/utm.py  | 2 +-
 uoftscrapers/scrapers/athletics/utsc.py | 3 ++-
 uoftscrapers/scrapers/exams/utsg.py     | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py
index 95663c2..0a689a9 100644
--- a/uoftscrapers/scrapers/athletics/utm.py
+++ b/uoftscrapers/scrapers/athletics/utm.py
@@ -74,4 +74,4 @@ def scrape(location='.', month=None, save=True):
                 Scraper.save_json(doc, location, id_)
 
         Scraper.logger.info('UTMAthletics completed.')
-        return athletics
+        return athletics if not save else None
diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py
index 5d6a34b..fd7b07f 100644
--- a/uoftscrapers/scrapers/athletics/utsc.py
+++ b/uoftscrapers/scrapers/athletics/utsc.py
@@ -72,4 +72,5 @@ def scrape(location='.', month=None, save=True):
                 Scraper.save_json(doc, location, id_)
 
         Scraper.logger.info('UTSCAthletics completed.')
-        return athletics
+
+        return athletics if not save else None
diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py
index 1e24b0a..668dae5 100644
--- a/uoftscrapers/scrapers/exams/utsg.py
+++ b/uoftscrapers/scrapers/exams/utsg.py
@@ -115,7 +115,7 @@ def scrape(location='.', year=None, save=True):
                 Scraper.save_json(doc, location, id_)
 
         Scraper.logger.info('ArtSciExams completed.')
-        return exams
+        return exams if not save else None
 
     @staticmethod
     def parse_course_info(period, course_code):
@@ -272,7 +272,7 @@ def scrape(location='.', year=None, save=True):
                 Scraper.save_json(doc, location, id_)
 
         Scraper.logger.info('EngExams completed.')
-        return exams
+        return exams if not save else None
 
     @staticmethod
     def get_course_info(course, period):

From 0d729f56f0137acf70f544f2a8e9bc359099873b Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Fri, 13 May 2016 21:28:44 -0400
Subject: [PATCH 04/10] Refactor date/session parsers

---
 uoftscrapers/scrapers/dates/utsg.py | 148 ++++++++++++++++------------
 1 file changed, 86 insertions(+), 62 deletions(-)

diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py
index 7b3aeb5..c9c9e53 100644
--- a/uoftscrapers/scrapers/dates/utsg.py
+++ b/uoftscrapers/scrapers/dates/utsg.py
@@ -37,121 +37,145 @@ def scrape(location='.', year=None, save=True):
         """Update the local JSON files for this scraper."""
         Scraper.logger.info('ArtSciDates initialized.')
 
-        for session, endpoint in ArtSciDates.get_sessions(year)[:1]:
-            headers = {
-                'Referer': ArtSciDates.host
-            }
+        docs = OrderedDict()
+
+        for endpoint in ArtSciDates.get_endpoints(year):
+            headers = {'Referer': ArtSciDates.host}
             html = Scraper.get('%s%s' % (ArtSciDates.host, endpoint),
                                headers=headers,
                                max_attempts=3)
 
             if html is None:
-                Scraper.logger.info('No data available for %s.' % session.upper)
+                Scraper.logger.info('No data available for %s.' % endpoint.upper())
                 continue
 
-            docs = OrderedDict()
-
             soup = BeautifulSoup(html, 'html.parser')
+
+            session = ArtSciDates.parse_session(soup)
+
             for tr in soup.find(class_='vertical listing').find_all('tr'):
                 if tr.find('th'):
                     continue
 
-                event = tr.find_all('td')
+                data = tr.find_all('td')
+
+                start, end = ArtSciDates.parse_dates(data[0].text, session)
 
-                start_date, end_date = ArtSciDates.parse_dates(event[0].text, session)
+                descriptions = []
+                for t in data[1].text.split(';\n'):
+                    descriptions += ArtSciDates.normalize_text(t)
 
                 events = []
-                for t in event[1].text.split(';\n'):
-                    events += ArtSciDates.normalize_text(t)
+                for description in descriptions:
+                    events.append(OrderedDict([
+                        ('end', end),
+                        ('session', session.upper()),
+                        ('campus', 'UTSG'),
+                        ('description', description)
+                    ]))
 
                 doc = OrderedDict([
-                    ('start_date', start_date),
-                    ('end_date', end_date),
-                    ('session', session),
-                    ('events', events)
+                    ('date', start),
+                    ('events', events),
                 ])
 
-                if start_date not in docs:
-                    docs[start_date] = doc
+                if start not in docs:
+                    docs[start] = doc
                 else:
-                    docs[start_date]['events'].extend(doc['events'])
+                    docs[start]['events'].extend(doc['events'])
 
         if save:
             for date, doc in docs.items():
                 Scraper.save_json(doc, location, date)
 
         Scraper.logger.info('ArtSciDates completed.')
-        return docs
+        return docs if not save else None
 
     @staticmethod
-    def normalize_text(text):
-        text = re.sub(r'\s\s+', ' ', text).strip()
-
-        if text == '':
-            return []
-
-        if '\n' in text and text[-2:] != '\n':
-            return text.split('\n')
-
-        return [text]
-
-    @staticmethod
-    def get_sessions(year):
+    def get_endpoints(year):
         try:
             date = datetime(year=year)
         except:
             year = None
 
         if year is None:
-            year = datetime.now().strftime('%Y')
+            year = datetime.now().strftime('%y')
+
+        session = '%s%d_fw' % (year, int(year) + 1)
+
+        endpoints = []
 
-        shortened_year = str(year)[2:]
-        session = '%s%d_fw' % (shortened_year, int(shortened_year) + 1)
+        headers = {'Referer': ArtSciDates.host}
+        html = Scraper.get('%s%s' % (ArtSciDates.host, session),
+                           headers=headers,
+                           max_attempts=3)
 
-        fall = '%s/%s_fall_dates' % (session, str(year))
-        winter = '%s/%d_winter_dates' % (session, int(year) + 1)
+        if html is None:
+            return endpoints
 
-        summer = '%s5/dates' % year
+        soup = BeautifulSoup(html, 'html.parser')
 
-        return [
-            ('FALL%s' % shortened_year, fall),
-            ('WINTER%s' % shortened_year, winter),
-            ('SUMMER%s' % shortened_year, summer)
-        ]
+        for a in soup.find(id='portal-column-one').find_all('a'):
+            if a.has_attr('title') and 'important dates' in a['title'].lower():
+                endpoints.append(a['href'])
+
+        return ['%s/%s' % (session, a.split('/')[-1]) for a in endpoints] +\
+            ['20%s5/dates' % year]
+
+    @staticmethod
+    def parse_session(soup):
+        session = ''
+        if soup.find(id='parent-fieldname-title'):
+            session = soup.find(id='parent-fieldname-title').text
+            session = session.replace('Important Dates', '').replace(':', '')
+        else:
+            # TODO parse page title
+            pass
+        return session.strip()
 
     @staticmethod
     def parse_dates(date, session):
-        def get_date(date_string):
-            # date_string in the form '%B %d'
-            month = date_string.split(' ')[0]
-            year = int(session[-2:])
-            if 'FALL' in session and int(datetime.strptime(month, '%B').strftime('%m')) < 4:
-                year += 1
 
-            return '%s %d' % (date_string, year)
+        def get_full_date(partial_date):
+            """Convert a partial date of the form `B d` (e.g. November 8)
+            to the form `B d Y` (e.g. November 8 2016)."""
+            month, day = partial_date.split(' ')
+            year = session[:4]
+            return '%s %s %s' % (month, day, year)
 
-        start = end = None
+        date = date.replace(' to ', '-').replace('(tentative)', '').strip()
         if '-' in date:
-            # Date range
+            # Date range (e.g. December 21 - January 4 or November 7-8)
             if ' - ' in date:
-                # e.g. December 21 - January 4
                 date = date.split(' - ')
-
-                start, end = get_date(date[0]), get_date(date[1])
+                start, end = get_full_date(date[0]), get_full_date(date[1])
             else:
-                # e.g. November 7-8
                 month, days = date.split(' ')
                 days = days.split('-')
 
-                start = get_date('%s %s' % (month, days[0]))
-                end = get_date('%s %s' % (month, days[1]))
+                start, end = get_full_date('%s %s' % (month, days[0])),\
+                    get_full_date('%s %s' % (month, days[1]))
         else:
-            start = end = get_date(date)
+            start = end = get_full_date(date)
+
+        return ArtSciDates.convert_date(start), ArtSciDates.convert_date(end)
+
+    @staticmethod
+    def convert_date(date):
+        """Convert a date of form `B d Y` (eg. May 13 2016) to ISO-8601."""
+        return datetime.strptime(date, '%B %d %Y').date().isoformat()
+
+    @staticmethod
+    def normalize_text(text):
+        text = re.sub(r'\s\s+', ' ', text).strip()
 
-        start = datetime.strptime(start, '%B %d %y').date().isoformat()
-        end = datetime.strptime(end, '%B %d %y').date().isoformat()
+        if text == '':
+            return []
 
-        return start, end
+        if '\n' in text and text[-2:] != '\n':
+            return text.split('\n')
+
+        return [text]
 
 
 class EngDates:

From 1ff4e07be2cf208204940c12d778e927481df004 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Sat, 14 May 2016 01:59:01 -0400
Subject: [PATCH 05/10] Initialize EngDates scraper

---
 uoftscrapers/scrapers/dates/utsg.py | 82 ++++++++++++++++++++++++++---
 1 file changed, 74 insertions(+), 8 deletions(-)

diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py
index c9c9e53..09b61a7 100644
--- a/uoftscrapers/scrapers/dates/utsg.py
+++ b/uoftscrapers/scrapers/dates/utsg.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 from pytz import timezone
 from pprint import pprint
+from time import sleep
 import re
 
 
@@ -14,11 +15,13 @@ class UTSGDates:
     def scrape(location='.'):
         Scraper.logger.info('UTSGDates initialized.')
 
-        for faculty in ArtSciDates, EngDates:
-            docs = faculty.scrape(location, save=False)
-            if docs is not None:
-                for date, doc in docs.items():
-                    Scraper.save_json(doc, location, date)
+        # for faculty in ArtSciDates, EngDates:
+        #     docs = faculty.scrape(location, save=False)
+        #     if docs is not None:
+        #         for date, doc in docs.items():
+        #             Scraper.save_json(doc, location, date)
+
+        EngDates.scrape(location)
 
         Scraper.logger.info('UTSGDates completed.')
 
@@ -119,7 +122,7 @@ def get_endpoints(year):
             if a.has_attr('title') and 'important dates' in a['title'].lower():
                 endpoints.append(a['href'])
 
-        return ['%s/%s' % (session, a.split('/')[-1]) for a in endpoints] +\
+        return ['%s/%s' % (session, a.split('/')[-1]) for a in endpoints] + \
             ['20%s5/dates' % year]
 
     @staticmethod
@@ -153,7 +156,7 @@ def get_full_date(partial_date):
                 month, days = date.split(' ')
                 days = days.split('-')
 
-                start, end = get_full_date('%s %s' % (month, days[0])),\
+                start, end = get_full_date('%s %s' % (month, days[0])), \
                     get_full_date('%s %s' % (month, days[1]))
         else:
             start = end = get_full_date(date)
@@ -185,9 +188,72 @@ class EngDates:
     http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm.
     """
 
+    host = 'http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm'
+
+    FORM_DATA = {
+        'viewstate': '__VIEWSTATE',
+        'viewstate_generator': '__VIEWSTATEGENERATOR',
+        'numerical_date': 'ctl02$ctlSelectedDate$hdnDateValueForQuestionnaireResponses',
+        'textual_date': 'ctl02$ctlSelectedDate$txtDate'
+    }
+
     @staticmethod
-    def scrape(location='.', save=True):
+    def scrape(location='.', year=None, save=True):
         """Update the local JSON files for this scraper."""
         Scraper.logger.info('EngDates initialized.')
 
+        year = year or datetime.now().year
+
+        viewstate, viewstate_generator, numerical_date, textual_date = (
+            EngDates.FORM_DATA[k] for k in ('viewstate', 'viewstate_generator', 'numerical_date', 'textual_date'))
+
+        headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive',
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'Referer': EngDates.host,
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
+        }
+
+        s = Scraper.s
+        s.headers.update(headers)
+
+        payload = {}
+        payload[viewstate], payload[viewstate_generator] = \
+            EngDates.get_viewstate(s)
+
+        for i in range(1, 13):
+            month = datetime.strptime(str(i), '%m').strftime('%B')
+            payload[textual_date] = '%s 15 %s' % (month, year)
+
+            payload[numerical_date] = '%s.%s.15' % (year, str(i).zfill(2))
+
+            attempts = 0
+
+            html = s.post(EngDates.host, data=payload).text.encode('utf-8') or ''
+            soup = BeautifulSoup(html, 'html.parser')
+
+            while attempts < 5 and soup.find('div', class_='error'):
+                print('attempt %d' % attempts)
+
+                html = s.post(EngDates.host, data=payload).text.encode('utf-8') or ''
+                soup = BeautifulSoup(html, 'html.parser')
+
+                print(soup.find('div', class_='error'))
+
+                attempts += 1
+                sleep(1)
+
+            if not html or soup.find('div', class_='error'):
+                continue
+
         Scraper.logger.info('EngDates completed.')
+
+    @staticmethod
+    def get_viewstate(s):
+        html = s.get(EngDates.host)
+        soup = BeautifulSoup(html.content, 'html.parser')
+
+        return soup.find(id='__VIEWSTATE')['value'],\
+            soup.find(id='__VIEWSTATEGENERATOR')['value']

From f74dcf724a0cc48ad7ca3758d4e9483ddba90822 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Sun, 15 May 2016 20:17:52 -0400
Subject: [PATCH 06/10] Move UTMDates scraper

---
 uoftscrapers/scrapers/calendar/utm.py | 70 ++----------------------
 uoftscrapers/scrapers/dates/utm.py    | 79 +++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 66 deletions(-)
 create mode 100644 uoftscrapers/scrapers/dates/utm.py

diff --git a/uoftscrapers/scrapers/calendar/utm.py b/uoftscrapers/scrapers/calendar/utm.py
index b90b4aa..d5bd387 100644
--- a/uoftscrapers/scrapers/calendar/utm.py
+++ b/uoftscrapers/scrapers/calendar/utm.py
@@ -4,76 +4,14 @@
 import json
 import os
 import requests
-import datetime
 
 
 class UTMCalendar:
-    '''Scraper for Important dates from UTM calendar found at https://www.utm.utoronto.ca/registrar/important-dates
-        '''
 
-    link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header='
-    sessionNumber = [5, 9]
-    @staticmethod
-    def scrape(location='.', year=None): #scrapes most current sessions by default
-        
-        year = year or datetime.datetime.now().year
+    host = 'http://www.artsandscience.utoronto.ca/ofr/calendar/'
 
-        currentSession = "{0} SUMMER"
-        calendar = OrderedDict()
+    @staticmethod
+    def scrape(location='.'):
         Scraper.logger.info('UTMCalendar initialized.')
-        for session in UTMCalendar.sessionNumber:
-            html = Scraper.get(UTMCalendar.link.format(year, session))
-            soup = BeautifulSoup(html, 'html.parser')
-            content = soup.find('div', class_='content')
-            dates = content.find_all('div', class_='title')
-            i = 0
-            currentDate = dates[i]
-            while(i<len(dates)):
-                date = dates[i].text
-                events = []
-                while (currentDate == dates[i]):
-                    info = dates[i].find_next('div', class_='info')
-                    description = info.text
-                    eventStartEnd = date.split('-') #splits event dates over a period
-                    eventStart = UTMCalendar.convert_date(eventStartEnd[0].strip())
-                    if len(eventStartEnd)>1:
-                        eventEnd = UTMCalendar.convert_date(eventStartEnd[1].strip())
-                    else:
-                        eventEnd = eventStart
-
-                    events.append(OrderedDict([
-                            ('end_date', eventEnd),
-                            ('session', currentSession.format(UTMCalendar.get_year_from(eventStart))),
-                            ('campus', 'UTM'),
-                            ('description', description)
-                        ]))
-                    i+=1
-                    if(i>=len(dates)):
-                        break;
-                calendar[date] = OrderedDict([
-                        ('date', eventStart),
-                        ('events', events)
-                    ])
-                if(i<len(dates)):
-                    currentDate = dates[i]
-            currentSession = "{0} FALL/WINTER"
-
-
-        for date, info in calendar.items():
-            Scraper.save_json(info, location, UTMCalendar.convert_date(date))
-
+        Scraper.logger.info('Not implemented.')
         Scraper.logger.info('UTMCalendar completed.')
-        return calendar
-
-    @staticmethod
-    def convert_date(date):
-        splitDate = date.split(' ')
-        year = splitDate[2]
-        day = splitDate[1].strip(',')
-        month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m')
-        return("{0}-{1}-{2}".format(year, month, day.zfill(2)))
-
-    @staticmethod
-    def get_year_from(date):
-        splitDate = date.split('-')
-        return splitDate[0]
\ No newline at end of file
diff --git a/uoftscrapers/scrapers/dates/utm.py b/uoftscrapers/scrapers/dates/utm.py
new file mode 100644
index 0000000..b3d960d
--- /dev/null
+++ b/uoftscrapers/scrapers/dates/utm.py
@@ -0,0 +1,79 @@
+from ..utils import Scraper
+from bs4 import BeautifulSoup
+from collections import OrderedDict
+import json
+import os
+import requests
+import datetime
+
+
+class UTMDates:
+    '''Scraper for Important dates from UTM calendar found at https://www.utm.utoronto.ca/registrar/important-dates
+        '''
+
+    link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header='
+    sessionNumber = [5, 9]
+    @staticmethod
+    def scrape(location='.', year=None): #scrapes most current sessions by default
+
+        year = year or datetime.datetime.now().year
+
+        currentSession = "{0} SUMMER"
+        calendar = OrderedDict()
+        Scraper.logger.info('UTMDates initialized.')
+        for session in UTMDates.sessionNumber:
+            html = Scraper.get(UTMDates.link.format(year, session))
+            soup = BeautifulSoup(html, 'html.parser')
+            content = soup.find('div', class_='content')
+            dates = content.find_all('div', class_='title')
+            i = 0
+            currentDate = dates[i]
+            while(i<len(dates)):
+                date = dates[i].text
+                events = []
+                while (currentDate == dates[i]):
+                    info = dates[i].find_next('div', class_='info')
+                    description = info.text
+                    eventStartEnd = date.split('-') #splits event dates over a period
+                    eventStart = UTMDates.convert_date(eventStartEnd[0].strip())
+                    if len(eventStartEnd)>1:
+                        eventEnd = UTMDates.convert_date(eventStartEnd[1].strip())
+                    else:
+                        eventEnd = eventStart
+
+                    events.append(OrderedDict([
+                            ('end_date', eventEnd),
+                            ('session', currentSession.format(UTMDates.get_year_from(eventStart))),
+                            ('campus', 'UTM'),
+                            ('description', description)
+                        ]))
+                    i+=1
+                    if(i>=len(dates)):
+                        break;
+                calendar[date] = OrderedDict([
+                        ('date', eventStart),
+                        ('events', events)
+                    ])
+                if(i<len(dates)):
+                    currentDate = dates[i]
+            currentSession = "{0} FALL/WINTER"
+
+
+        for date, info in calendar.items():
+            Scraper.save_json(info, location, UTMDates.convert_date(date))
+
+        Scraper.logger.info('UTMDates completed.')
+        return calendar
+
+    @staticmethod
+    def convert_date(date):
+        splitDate = date.split(' ')
+        year = splitDate[2]
+        day = splitDate[1].strip(',')
+        month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m')
+        return("{0}-{1}-{2}".format(year, month, day.zfill(2)))
+
+    @staticmethod
+    def get_year_from(date):
+        splitDate = date.split('-')
+        return splitDate[0]

From 05545f308f9ab148894ae08f5ceec1b41965d7e8 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Sun, 15 May 2016 20:40:22 -0400
Subject: [PATCH 07/10] PEP 8

---
 uoftscrapers/scrapers/dates/utm.py | 32 +++++++++++++++---------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/uoftscrapers/scrapers/dates/utm.py b/uoftscrapers/scrapers/dates/utm.py
index b3d960d..2b158b7 100644
--- a/uoftscrapers/scrapers/dates/utm.py
+++ b/uoftscrapers/scrapers/dates/utm.py
@@ -14,7 +14,7 @@ class UTMDates:
     link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header='
     sessionNumber = [5, 9]
     @staticmethod
-    def scrape(location='.', year=None): #scrapes most current sessions by default
+    def scrape(location='.', year=None, save=True):  # scrapes most current sessions by default
 
         year = year or datetime.datetime.now().year
 
@@ -28,15 +28,15 @@ def scrape(location='.', year=None): #scrapes most current sessions by default
             dates = content.find_all('div', class_='title')
             i = 0
             currentDate = dates[i]
-            while(i<len(dates)):
+            while(i < len(dates)):
                 date = dates[i].text
                 events = []
                 while (currentDate == dates[i]):
                     info = dates[i].find_next('div', class_='info')
                     description = info.text
-                    eventStartEnd = date.split('-') #splits event dates over a period
+                    eventStartEnd = date.split('-')  # splits event dates over a period
                     eventStart = UTMDates.convert_date(eventStartEnd[0].strip())
-                    if len(eventStartEnd)>1:
+                    if len(eventStartEnd) > 1:
                         eventEnd = UTMDates.convert_date(eventStartEnd[1].strip())
                     else:
                         eventEnd = eventStart
@@ -47,23 +47,23 @@ def scrape(location='.', year=None): #scrapes most current sessions by default
                             ('campus', 'UTM'),
                             ('description', description)
                         ]))
-                    i+=1
-                    if(i>=len(dates)):
-                        break;
-                calendar[date] = OrderedDict([
-                        ('date', eventStart),
-                        ('events', events)
-                    ])
-                if(i<len(dates)):
+                    i += 1
+                    if(i >= len(dates)):
+                        break
+                calendar[eventStart] = OrderedDict([
+                    ('date', eventStart),
+                    ('events', events)
+                ])
+                if(i < len(dates)):
                     currentDate = dates[i]
             currentSession = "{0} FALL/WINTER"
 
-
-        for date, info in calendar.items():
-            Scraper.save_json(info, location, UTMDates.convert_date(date))
+        if save:
+            for date, info in calendar.items():
+                Scraper.save_json(info, location, date)  # keys are already ISO dates; re-converting raises IndexError
 
         Scraper.logger.info('UTMDates completed.')
-        return calendar
+        return calendar if not save else None
 
     @staticmethod
     def convert_date(date):

From fa4dfcbada87bae5085a6f2226693924283d8883 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Sun, 15 May 2016 20:43:01 -0400
Subject: [PATCH 08/10] Add EngDates scraper

---
 uoftscrapers/scrapers/dates/utsg.py | 102 ++++++++++++++++++++++------
 1 file changed, 82 insertions(+), 20 deletions(-)

diff --git a/uoftscrapers/scrapers/dates/utsg.py b/uoftscrapers/scrapers/dates/utsg.py
index 09b61a7..f0734e7 100644
--- a/uoftscrapers/scrapers/dates/utsg.py
+++ b/uoftscrapers/scrapers/dates/utsg.py
@@ -3,27 +3,41 @@
 from collections import OrderedDict
 from datetime import datetime
 from pytz import timezone
-from pprint import pprint
 from time import sleep
 import re
+import http.cookiejar
 
 
 class UTSGDates:
     """A scraper for UTSG important dates."""
 
     @staticmethod
-    def scrape(location='.'):
+    def scrape(location='.', year=None, save=True):
         Scraper.logger.info('UTSGDates initialized.')
 
-        # for faculty in ArtSciDates, EngDates:
-        #     docs = faculty.scrape(location, save=False)
-        #     if docs is not None:
-        #         for date, doc in docs.items():
-        #             Scraper.save_json(doc, location, date)
+        docs = OrderedDict()
+
+        for faculty in ArtSciDates, EngDates:
+            dates = faculty.scrape(location, year=year, save=False)
+
+            if dates is None:
+                continue
+
+            for date, doc in dates.items():
+                if date not in docs:
+                    docs[date] = OrderedDict([
+                        ('date', date),
+                        ('events', [])
+                    ])
 
-        EngDates.scrape(location)
+                docs[date]['events'].extend(doc['events'])
+
+        if save:
+            for date, doc in docs.items():
+                Scraper.save_json(doc, location, date)
 
         Scraper.logger.info('UTSGDates completed.')
+        return docs if not save else None
 
 
 class ArtSciDates:
@@ -190,7 +204,9 @@ class EngDates:
 
     host = 'http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm'
 
-    FORM_DATA = {
+    cookies = http.cookiejar.CookieJar()
+
+    form_data = {
         'viewstate': '__VIEWSTATE',
         'viewstate_generator': '__VIEWSTATEGENERATOR',
         'numerical_date': 'ctl02$ctlSelectedDate$hdnDateValueForQuestionnaireResponses',
@@ -205,7 +221,7 @@ def scrape(location='.', year=None, save=True):
         year = year or datetime.now().year
 
         viewstate, viewstate_generator, numerical_date, textual_date = \
-            EngDates.FORM_DATA.values()
+            EngDates.form_data.values()
 
         headers = {
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
@@ -223,36 +239,82 @@ def scrape(location='.', year=None, save=True):
         payload[viewstate], payload[viewstate_generator] = \
             EngDates.get_viewstate(s)
 
-        for i in range(1, 13):
-            month = datetime.strptime(str(i), '%m').strftime('%B')
-            payload[textual_date] = '%s 15 %s' % (month, year)
+        docs = OrderedDict()
+
+        for m in range(1, 13):
+            Scraper.logger.info('Scraping month %d' % m)
 
-            payload[numerical_date] = '%s.%s.15' % (year, str(i).zfill(2))
+            month = datetime.strptime(str(m), '%m').strftime('%B')
+            payload[textual_date] = '%s 15 %s' % (month, year)
+            payload[numerical_date] = '%s.%s.15' % (year, str(m).zfill(2))
 
             attempts = 0
 
-            html = s.post(EngDates.host, data=payload).text.encode('utf-8') or ''
+            html = s.post(EngDates.host,
+                          data=payload,
+                          cookies=EngDates.cookies).text.encode('utf-8') or ''
             soup = BeautifulSoup(html, 'html.parser')
 
             while attempts < 5 and soup.find('div', class_='error'):
-                print('attempt %d' % attempts)
+                Scraper.logger.info('Attempt #%d' % (attempts + 1))
 
-                html = s.post(EngDates.host, data=payload).text.encode('utf-8') or ''
+                html = s.post(EngDates.host,
+                              data=payload,
+                              cookies=EngDates.cookies).text.encode('utf-8') or ''
                 soup = BeautifulSoup(html, 'html.parser')
 
-                print(soup.find('div', class_='error'))
-
                 attempts += 1
                 sleep(1)
 
             if not html or soup.find('div', class_='error'):
+                Scraper.logger.info('Couldn\'t scrape month %d' % m)
                 continue
 
+            for tr in soup.find(id='ctl02_ctlCalendar').find_all('tr')[2:]:
+                for td in tr.find_all('td'):
+                    if not td.find('a') or not td.find('div').find('a'):
+                        continue
+
+                    start = end = EngDates.get_date(td.find('a')['title'], year)
+
+                    session = '%d ENGINEERING' % year
+
+                    events = []
+                    for div in td.find_all('div'):
+                        event = div.find('a')
+
+                        events.append(OrderedDict([
+                            ('end_date', end),
+                            ('session', session),
+                            ('campus', 'UTSG'),
+                            ('description', event.text)
+                        ]))
+
+                    if start not in docs:
+                        docs[start] = OrderedDict([
+                            ('date', start),
+                            ('events', events)
+                        ])
+                    else:
+                        docs[start]['events'].extend(events)
+
+        if save:
+            for date, doc in docs.items():
+                Scraper.save_json(doc, location, date)
+
         Scraper.logger.info('EngDates completed.')
+        return docs if not save else None
+
+    @staticmethod
+    def get_date(date, year):
+        """Return an ISO 8601 date from a date string of the form `Month d`."""
+        date = '%s %s' % (year, date)
+        return datetime.strptime(date, '%Y %B %d').date().isoformat()
 
     @staticmethod
     def get_viewstate(s):
-        html = s.get(EngDates.host)
+        headers = {'Referer': EngDates.host}
+        html = s.get(EngDates.host, headers=headers)
         soup = BeautifulSoup(html.content, 'html.parser')
 
         return soup.find(id='__VIEWSTATE')['value'],\

From e5834bd7875d1acbf6fe6d44b1a9d5080cedb5c1 Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Sun, 15 May 2016 20:43:14 -0400
Subject: [PATCH 09/10] Merge UTM/UTSG docs

---
 uoftscrapers/scrapers/dates/__init__.py | 26 +++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/uoftscrapers/scrapers/dates/__init__.py b/uoftscrapers/scrapers/dates/__init__.py
index f434b7a..9432e1a 100644
--- a/uoftscrapers/scrapers/dates/__init__.py
+++ b/uoftscrapers/scrapers/dates/__init__.py
@@ -1,11 +1,33 @@
 from ..utils import Scraper
 from .utsg import UTSGDates
+from .utm import UTMDates
+
+from collections import OrderedDict
 
 
 class Dates:
 
     @staticmethod
-    def scrape(location='.'):
+    def scrape(location='.', year=None):
         Scraper.logger.info('Dates initialized.')
-        UTSGDates.scrape(location)
+
+        docs = OrderedDict()
+
+        for campus in UTSGDates, UTMDates:
+            dates = campus.scrape(location, year=year, save=False)
+
+            if dates is None:
+                continue
+
+            for date, doc in dates.items():
+                if date not in docs:
+                    docs[date] = OrderedDict([
+                        ('date', date),
+                        ('events', [])
+                    ])
+                docs[date]['events'].extend(doc['events'])
+
+        for date, doc in docs.items():
+            Scraper.save_json(doc, location, date)
+
         Scraper.logger.info('Dates completed.')

From 2a432e20db7ed1abf93314c65a3395d5614be57d Mon Sep 17 00:00:00 2001
From: Kashav Madan <kshvmdn@gmail.com>
Date: Sun, 15 May 2016 20:43:22 -0400
Subject: [PATCH 10/10] Add Dates reference

---
 README.md | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/README.md b/README.md
index a8621eb..08ddf29 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,9 @@ This is a library of scrapers for various University of Toronto websites. It is
   - [Shuttle Bus Schedule](#shuttles)
   - [Events](#events)
   - [Libraries](#libraries)
+  - [Dates](#dates)
+    - [UTSG Dates](#utsg-dates)
+    - [UTM Dates](#utm-dates)
 
 ## Requirements
  - [python3](https://www.python.org/download/releases/3.5.1)
@@ -692,3 +695,62 @@ https://onesearch.library.utoronto.ca/
   }
 }
 ```
+
+--------------------------------------------------------------------------------
+
+### Dates
+
+##### Class name
+```python
+uoftscrapers.Dates
+```
+
+##### Scraper source
+ - [UTSG Dates](#utsg-dates)
+ - [UTM Dates](#utm-dates)
+
+##### Output format
+```js
+{
+  "date": String,
+  "events": [{
+    "end_date": String,
+    "session": String,
+    "campus": String,
+    "description": String
+  }]
+}
+```
+
+----------------------------------------
+
+### UTSG Dates
+
+##### Class name
+```python
+uoftscrapers.UTSGDates
+```
+
+##### Scraper source
+ - http://www.artsci.utoronto.ca/current/course/timetable/
+ - http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm
+
+##### Output format
+Refer to [Dates](#dates)
+
+--------------------
+
+### UTM Dates
+
+##### Class name
+```python
+uoftscrapers.UTMDates
+```
+
+##### Scraper source
+http://m.utm.utoronto.ca/importantDates.php
+
+##### Output format
+Refer to [Dates](#dates)
+
+--------------------------------------------------------------------------------