From 1416ef0c077f073d2ae6520e713c6c19a3ae9a10 Mon Sep 17 00:00:00 2001
From: Eugene Cheung <ey.cheung@outlook.com>
Date: Sun, 17 Apr 2016 16:39:15 -0400
Subject: [PATCH 1/2] Use Scraper.get in all scrapers

---
 uoftscrapers/scrapers/athletics/utm.py      |  2 +-
 uoftscrapers/scrapers/athletics/utsc.py     |  2 +-
 uoftscrapers/scrapers/buildings/__init__.py | 14 ++++----
 uoftscrapers/scrapers/courses/__init__.py   | 36 ++-------------------
 uoftscrapers/scrapers/exams/utm.py          | 12 +++----
 uoftscrapers/scrapers/exams/utsc.py         |  6 ++--
 uoftscrapers/scrapers/exams/utsg.py         |  8 ++---
 uoftscrapers/scrapers/food/__init__.py      |  6 ++--
 uoftscrapers/scrapers/parking/__init__.py   |  2 --
 uoftscrapers/scrapers/shuttle/__init__.py   |  2 +-
 uoftscrapers/scrapers/textbooks/__init__.py | 22 ++++++-------
 uoftscrapers/scrapers/timetable/utsg.py     |  4 +--
 uoftscrapers/scrapers/utils/scraper.py      | 21 ++++++++----
 13 files changed, 51 insertions(+), 86 deletions(-)

diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py
index 60ae255..bf2ded5 100644
--- a/uoftscrapers/scrapers/athletics/utm.py
+++ b/uoftscrapers/scrapers/athletics/utm.py
@@ -20,7 +20,7 @@ def scrape(location='.', month=None):
         month = month or UTMAthletics.get_month(month)
 
         Scraper.logger.info('UTMAthletics initialized.')
-        html = Scraper.get_html('%s%s' % (UTMAthletics.host, month))
+        html = Scraper.get('%s%s' % (UTMAthletics.host, month))
         soup = BeautifulSoup(html, 'html.parser')
 
         athletics = OrderedDict()
diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py
index 5a8f1a5..f77f691 100644
--- a/uoftscrapers/scrapers/athletics/utsc.py
+++ b/uoftscrapers/scrapers/athletics/utsc.py
@@ -20,7 +20,7 @@ def scrape(location='.', month=None):
         month = month or UTSCAthletics.get_month(month)
 
         Scraper.logger.info('UTSCAthletics initialized.')
-        html = Scraper.get_html('%s%s' % (UTSCAthletics.host, month))
+        html = Scraper.get('%s%s' % (UTSCAthletics.host, month))
         soup = BeautifulSoup(html, 'html.parser')
 
         athletics = OrderedDict()
diff --git a/uoftscrapers/scrapers/buildings/__init__.py b/uoftscrapers/scrapers/buildings/__init__.py
index f3be615..ca38d20 100644
--- a/uoftscrapers/scrapers/buildings/__init__.py
+++ b/uoftscrapers/scrapers/buildings/__init__.py
@@ -5,7 +5,6 @@
 import json
 import os
 import re
-import requests
 
 
 class Buildings:
@@ -16,7 +15,6 @@ class Buildings:
 
     host = 'http://map.utoronto.ca/'
     campuses = ['utsg', 'utm', 'utsc']
-    s = requests.Session()
 
     @staticmethod
     def scrape(location='.'):
@@ -82,14 +80,14 @@ def get_map_json(campus):
 
         Scraper.logger.info('Scraping %s.' % campus.upper())
 
-        Buildings.s.get(Buildings.host)
+        Scraper.get(Buildings.host)
 
         headers = { 'Referer': Buildings.host }
-        html = Buildings.s.get('%s%s%s' % (
+        html = Scraper.get('%s%s%s' % (
             Buildings.host,
             'data/map/',
             campus
-        ), headers=headers).text
+        ), headers=headers)
 
         data = json.loads(html)
         return data
@@ -98,14 +96,14 @@ def get_map_json(campus):
     def get_regions_json(campus):
         """Retrieve the JSON structure from host."""
 
-        Buildings.s.get(Buildings.host)
+        Scraper.get(Buildings.host)
 
         headers = { 'Referer': Buildings.host }
-        html = Buildings.s.get('%s%s%s' % (
+        html = Scraper.get('%s%s%s' % (
             Buildings.host,
             'data/regions/',
             campus
-        ), headers=headers).text
+        ), headers=headers)
 
         data = json.loads(html)
         return data
diff --git a/uoftscrapers/scrapers/courses/__init__.py b/uoftscrapers/scrapers/courses/__init__.py
index 84f69eb..008ca5f 100755
--- a/uoftscrapers/scrapers/courses/__init__.py
+++ b/uoftscrapers/scrapers/courses/__init__.py
@@ -3,13 +3,11 @@
 from collections import OrderedDict
 from queue import Queue
 from threading import Thread, Lock
-from time import time, sleep
+from time import time
 import http.cookiejar
 import json
-import logging
 import os
 import re
-import requests
 import sys
 
 
@@ -21,7 +19,6 @@ class Courses:
 
     host = 'http://coursefinder.utoronto.ca/course-search/search'
     cookies = http.cookiejar.CookieJar()
-    s = requests.Session()
     threads = 32
 
     @staticmethod
@@ -74,37 +71,10 @@ def search(query='', requirements=''):
             'campusParam': 'St. George,Scarborough,Mississauga'
         }
 
-        # Keep trying to get data until a proper response is given
-        json = None
-        while json is None:
-            try:
-                r = Courses.s.get(url, params=data,
-                    cookies=Courses.cookies)
-                if r.status_code == 200:
-                    json = r.json()
-                else:
-                    sleep(0.5)
-            except requests.exceptions.Timeout:
-                continue
+        json = Scraper.get(url, params=data, cookies=Courses.cookies, json=True)
 
         return json['aaData']
 
-    @staticmethod
-    def get_course_html(url):
-        """Update the locally stored course pages."""
-
-        html = None
-        while html is None:
-            try:
-                r = Courses.s.get(url, cookies=Courses.cookies)
-                if r.status_code == 200:
-                    html = r.text
-            except (requests.exceptions.Timeout,
-                    requests.exceptions.ConnectionError):
-                continue
-
-        return html.encode('utf-8')
-
     @staticmethod
     def parse_course_html(course_id, html):
         """Create JSON files from the HTML pages downloaded."""
@@ -283,7 +253,7 @@ def __init__(self, queue):
     def run(self):
         while True:
             course_id, url, total = self.queue.get()
-            html = Courses.get_course_html(url)
+            html = Scraper.get(url)
             course = Courses.parse_course_html(course_id, html)
 
             CourseFinderWorker.lock.acquire()
diff --git a/uoftscrapers/scrapers/exams/utm.py b/uoftscrapers/scrapers/exams/utm.py
index 80bb36e..87b2473 100644
--- a/uoftscrapers/scrapers/exams/utm.py
+++ b/uoftscrapers/scrapers/exams/utm.py
@@ -1,8 +1,7 @@
 from ..utils import Scraper
 from bs4 import BeautifulSoup
-from datetime import datetime
 from collections import OrderedDict
-import requests
+from datetime import datetime
 import pytz
 import re
 
@@ -11,7 +10,6 @@ class UTMExams:
     """A scraper for UTM exams."""
 
     host = 'https://m.utm.utoronto.ca/'
-    s = requests.Session()
 
     @staticmethod
     def scrape(location='.'):
@@ -42,8 +40,8 @@ def retrieve_exams(courses):
             headers = {
                 'Referer': UTMExams.host
             }
-            html = UTMExams.s.get('%s%s' % (UTMExams.host, course),
-                                  headers=headers).text
+            html = Scraper.get('%s%s' % (UTMExams.host, course),
+                                  headers=headers)
             soup = BeautifulSoup(html, 'html.parser')
 
             course_code = soup.find('div', class_='title').text.strip()
@@ -103,8 +101,8 @@ def get_page_links(endpoint):
         headers = {
             'Referer': UTMExams.host
         }
-        html = UTMExams.s.get('%s%s' % (UTMExams.host, endpoint),
-                              headers=headers).text
+        html = Scraper.get('%s%s' % (UTMExams.host, endpoint),
+                              headers=headers)
         soup = BeautifulSoup(html, 'html.parser')
         return [li.find('a')['href']
                 for li in soup.find('ul', class_='link').find_all('li')]
diff --git a/uoftscrapers/scrapers/exams/utsc.py b/uoftscrapers/scrapers/exams/utsc.py
index 02c5c09..f7a0ae0 100644
--- a/uoftscrapers/scrapers/exams/utsc.py
+++ b/uoftscrapers/scrapers/exams/utsc.py
@@ -1,8 +1,7 @@
 from ..utils import Scraper
 from bs4 import BeautifulSoup
-from datetime import datetime
 from collections import OrderedDict
-import requests
+from datetime import datetime
 import pytz
 
 
@@ -10,7 +9,6 @@ class UTSCExams:
     """A scraper for UTSC exams."""
 
     host = 'http://www.utsc.utoronto.ca/registrar/examination-schedule'
-    s = requests.Session()
 
     @staticmethod
     def scrape(location='.'):
@@ -21,7 +19,7 @@ def scrape(location='.'):
         headers = {
             'Referer': UTSCExams.host
         }
-        html = UTSCExams.s.get('%s' % UTSCExams.host, headers=headers).text
+        html = Scraper.get('%s' % UTSCExams.host, headers=headers)
         soup = BeautifulSoup(html, 'html.parser')
 
         for table in soup.find_all('table', class_='views-table'):
diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py
index 7ea6fa3..f7035ed 100644
--- a/uoftscrapers/scrapers/exams/utsg.py
+++ b/uoftscrapers/scrapers/exams/utsg.py
@@ -1,8 +1,7 @@
 from ..utils import Scraper
 from bs4 import BeautifulSoup
-from datetime import datetime, date
 from collections import OrderedDict
-import requests
+from datetime import datetime, date
 import pytz
 
 
@@ -13,7 +12,6 @@ class UTSGExams:
     """
 
     host = 'http://www.artsci.utoronto.ca/current/exams/'
-    s = requests.Session()
 
     @staticmethod
     def scrape(location='.', year=None):
@@ -29,8 +27,8 @@ def scrape(location='.', year=None):
             headers = {
                 'Referer': UTSGExams.host
             }
-            html = UTSGExams.s.get('%s%s' % (UTSGExams.host, p),
-                                   headers=headers).text
+            html = Scraper.get('%s%s' % (UTSGExams.host, p),
+                                   headers=headers)
             soup = BeautifulSoup(html, 'html.parser')
 
             if not soup.find('table', class_='vertical listing'):
diff --git a/uoftscrapers/scrapers/food/__init__.py b/uoftscrapers/scrapers/food/__init__.py
index 34533f7..939174b 100644
--- a/uoftscrapers/scrapers/food/__init__.py
+++ b/uoftscrapers/scrapers/food/__init__.py
@@ -1,7 +1,6 @@
 from ..utils import Scraper, LayersScraper
 from bs4 import BeautifulSoup
 from collections import OrderedDict
-import requests
 
 
 class Food:
@@ -12,7 +11,6 @@ class Food:
 
     host = 'http://map.utoronto.ca/'
     campuses = [('utsg', 2), ('utm', 1), ('utsc', 0)]
-    s = requests.Session()
 
     @staticmethod
     def scrape(location='.'):
@@ -101,8 +99,8 @@ def conv_time(t):
         headers = {
             'Referer': Food.host
         }
-        html = Food.s.get('%s%s%s' % (Food.host, 'json/hours/', food_id),
-            headers=headers).text
+        html = Scraper.get('%s%s%s' % (Food.host, 'json/hours/', food_id),
+            headers=headers)
         soup = BeautifulSoup(html, 'html.parser')
 
         hours = OrderedDict()
diff --git a/uoftscrapers/scrapers/parking/__init__.py b/uoftscrapers/scrapers/parking/__init__.py
index af8b016..8bcc054 100644
--- a/uoftscrapers/scrapers/parking/__init__.py
+++ b/uoftscrapers/scrapers/parking/__init__.py
@@ -3,7 +3,6 @@
 from collections import OrderedDict
 from pprint import pprint
 import json
-import requests
 
 
 class Parking:
@@ -18,7 +17,6 @@ class Parking:
         'utm': 6,
         'utsc': 5
     }
-    s = requests.Session()
 
     @staticmethod
     def scrape(location='.'):
diff --git a/uoftscrapers/scrapers/shuttle/__init__.py b/uoftscrapers/scrapers/shuttle/__init__.py
index 65a1188..dbe7376 100644
--- a/uoftscrapers/scrapers/shuttle/__init__.py
+++ b/uoftscrapers/scrapers/shuttle/__init__.py
@@ -35,7 +35,7 @@ def scrape(location='.', month=None):
         Scraper.logger.info('Fetching schedules for {0}-{1}-01 to {0}-{1}-{2}.'.format(year, month, days))
 
         for day in range(1, days + 1):
-            html = Scraper.get_html(Shuttle.host % (year, month, day))
+            html = Scraper.get(Shuttle.host % (year, month, day))
             schedule = Shuttle.parse_schedule_html(html)
 
             Scraper.save_json(schedule, location, schedule['date'])
diff --git a/uoftscrapers/scrapers/textbooks/__init__.py b/uoftscrapers/scrapers/textbooks/__init__.py
index 4c2f4e6..21793b9 100644
--- a/uoftscrapers/scrapers/textbooks/__init__.py
+++ b/uoftscrapers/scrapers/textbooks/__init__.py
@@ -6,10 +6,8 @@
 from queue import Queue
 from threading import Thread, Lock
 from time import time
-import logging
 import os
 import re
-import requests
 import sys
 
 
@@ -104,9 +102,9 @@ def scrape(location='.'):
 
     @staticmethod
     def retrieve_terms():
-        r = requests.get('%s/buy_courselisting.asp' % Textbooks.host)
+        html = Scraper.get('%s/buy_courselisting.asp' % Textbooks.host)
 
-        listing = BeautifulSoup(r.text, "html.parser")
+        listing = BeautifulSoup(html, "html.parser")
         terms = listing.find(id='fTerm').find_all('option')[1:]
 
         accepted_terms = []
@@ -138,10 +136,10 @@ def retrieve_departments(terms):
                 'Referer': '%s/buy_courselisting.asp' % Textbooks.host
             }
 
-            r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
+            xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
                 params=payload, headers=headers)
 
-            departments = BeautifulSoup(r.text, "xml").find_all('department')
+            departments = BeautifulSoup(xml, "xml").find_all('department')
             for department in departments:
                 all_departments.append({
                     'dept_id': department.get('id'),
@@ -168,10 +166,10 @@ def retrieve_courses(department):
             'Referer': '%s/buy_courselisting.asp' % Textbooks.host
         }
 
-        r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
+        xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
             params=payload, headers=headers)
 
-        courses = BeautifulSoup(r.text, "xml").find_all('course')
+        courses = BeautifulSoup(xml, "xml").find_all('course')
         for course in courses:
             all_courses.append({
                 'course_id': course.get('id'),
@@ -196,10 +194,10 @@ def retrieve_sections(course):
             'Referer': '%s/buy_courselisting.asp' % Textbooks.host
         }
 
-        r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
+        xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
             params=payload, headers=headers)
 
-        sections = BeautifulSoup(r.text, "xml").find_all('section')
+        sections = BeautifulSoup(xml, "xml").find_all('section')
         for section in sections:
             all_sections.append({
                 'section_id': section.get('id'),
@@ -224,10 +222,10 @@ def retrieve_books(section):
             'Referer': '%s/buy_courselisting.asp' % Textbooks.host
         }
 
-        r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
+        xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
             params=payload, headers=headers)
 
-        soup = BeautifulSoup(r.text, "html.parser")
+        soup = BeautifulSoup(xml, "html.parser")
         books = soup.find_all('tr', { 'class': 'book' })
 
         if books == None:
diff --git a/uoftscrapers/scrapers/timetable/utsg.py b/uoftscrapers/scrapers/timetable/utsg.py
index 9fd774e..d91eef1 100644
--- a/uoftscrapers/scrapers/timetable/utsg.py
+++ b/uoftscrapers/scrapers/timetable/utsg.py
@@ -41,7 +41,7 @@ def scrape(location):
                     sponsor.split('.')[0]
                 ))
 
-                html = Scraper.get_html('%s/%s/%s' % (
+                html = Scraper.get('%s/%s/%s' % (
                     UTSGTimetable.host,
                     term,
                     sponsor
@@ -330,7 +330,7 @@ def format_data(text, regex):
 
     @staticmethod
     def get_sponsors(term):
-        html = Scraper.get_html('%s/%s/index.html' % (
+        html = Scraper.get('%s/%s/index.html' % (
             UTSGTimetable.host,
             term
         ))
diff --git a/uoftscrapers/scrapers/utils/scraper.py b/uoftscrapers/scrapers/utils/scraper.py
index 42a058f..4812d24 100644
--- a/uoftscrapers/scrapers/utils/scraper.py
+++ b/uoftscrapers/scrapers/utils/scraper.py
@@ -1,3 +1,4 @@
+from time import sleep
 import json
 import logging
 import os
@@ -27,22 +28,39 @@ def save_json(data, location, filename):
             json.dump(data, outfile)
 
     @staticmethod
-    def get_html(url, params=None, cookies=None, headers=None, max_attempts=10):
-        """Fetches the HTML page source, automatically retrying if it times out."""
+    def get(url, params=None, cookies=None, headers=None, json=False, max_attempts=10):
+        """Fetches an Internet document, retrying on timeouts and bad statuses.
+
+        Every failed attempt (timeout, connection error, or non-200 status)
+        counts towards max_attempts. Returns the parsed JSON body when json
+        is True, otherwise the body text encoded as UTF-8 bytes. Returns None
+        if no attempt succeeded.
+        """
 
-        html = None
+        doc = None
         attempts = 0
-        while html is None and attempts < max_attempts:
+        while doc is None and attempts < max_attempts:
             try:
                 r = Scraper.s.get(url, params=params, cookies=cookies, headers=headers)
                 if r.status_code == 200:
-                    html = r.text
+                    doc = r
+                else:
+                    # Count bad statuses too, otherwise a permanent 404/500
+                    # would make this loop spin forever.
+                    attempts += 1
+                    sleep(0.5)
             except (requests.exceptions.Timeout,
                     requests.exceptions.ConnectionError):
                 attempts += 1
                 continue
 
-        return html.encode('utf-8') if html else None
+        if doc is None:
+            return None
+
+        if json:
+            return doc.json()
+        else:
+            return doc.text.encode('utf-8')
 
     @staticmethod
     def flush_percentage(decimal):

From 9b1d3c720a4be9401b630bab676176019baf7fc6 Mon Sep 17 00:00:00 2001
From: Eugene Cheung <ey.cheung@outlook.com>
Date: Sun, 17 Apr 2016 16:42:18 -0400
Subject: [PATCH 2/2] Pass course cookies to Scraper.get in CourseFinderWorker

---
 uoftscrapers/scrapers/courses/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/uoftscrapers/scrapers/courses/__init__.py b/uoftscrapers/scrapers/courses/__init__.py
index 008ca5f..c73a013 100755
--- a/uoftscrapers/scrapers/courses/__init__.py
+++ b/uoftscrapers/scrapers/courses/__init__.py
@@ -253,7 +253,7 @@ def __init__(self, queue):
     def run(self):
         while True:
             course_id, url, total = self.queue.get()
-            html = Scraper.get(url)
+            html = Scraper.get(url, cookies=Courses.cookies)
             course = Courses.parse_course_html(course_id, html)
 
             CourseFinderWorker.lock.acquire()