Skip to content

Commit

Permalink
Merge pull request #55 from arkon/parent-get
Browse files · Browse the repository at this point in the history
Use Scraper.get in all scrapers
  • Loading branch information
qasim committed Apr 17, 2016
2 parents: e1d388e + 9b1d3c7 · commit c2a6c76
Show file tree
Hide file tree
Showing 13 changed files with 51 additions and 86 deletions.
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/athletics/utm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def scrape(location='.', month=None):
month = month or UTMAthletics.get_month(month)

Scraper.logger.info('UTMAthletics initialized.')
html = Scraper.get_html('%s%s' % (UTMAthletics.host, month))
html = Scraper.get('%s%s' % (UTMAthletics.host, month))
soup = BeautifulSoup(html, 'html.parser')

athletics = OrderedDict()
Expand Down
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/athletics/utsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def scrape(location='.', month=None):
month = month or UTSCAthletics.get_month(month)

Scraper.logger.info('UTSCAthletics initialized.')
html = Scraper.get_html('%s%s' % (UTSCAthletics.host, month))
html = Scraper.get('%s%s' % (UTSCAthletics.host, month))
soup = BeautifulSoup(html, 'html.parser')

athletics = OrderedDict()
Expand Down
14 changes: 6 additions & 8 deletions uoftscrapers/scrapers/buildings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import json
import os
import re
import requests


class Buildings:
Expand All @@ -16,7 +15,6 @@ class Buildings:

host = 'http://map.utoronto.ca/'
campuses = ['utsg', 'utm', 'utsc']
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down Expand Up @@ -82,14 +80,14 @@ def get_map_json(campus):

Scraper.logger.info('Scraping %s.' % campus.upper())

Buildings.s.get(Buildings.host)
Scraper.get(Buildings.host)

headers = { 'Referer': Buildings.host }
html = Buildings.s.get('%s%s%s' % (
html = Scraper.get('%s%s%s' % (
Buildings.host,
'data/map/',
campus
), headers=headers).text
), headers=headers)

data = json.loads(html)
return data
Expand All @@ -98,14 +96,14 @@ def get_map_json(campus):
def get_regions_json(campus):
"""Retrieve the JSON structure from host."""

Buildings.s.get(Buildings.host)
Scraper.get(Buildings.host)

headers = { 'Referer': Buildings.host }
html = Buildings.s.get('%s%s%s' % (
html = Scraper.get('%s%s%s' % (
Buildings.host,
'data/regions/',
campus
), headers=headers).text
), headers=headers)

data = json.loads(html)
return data
36 changes: 3 additions & 33 deletions uoftscrapers/scrapers/courses/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
from collections import OrderedDict
from queue import Queue
from threading import Thread, Lock
from time import time, sleep
from time import time
import http.cookiejar
import json
import logging
import os
import re
import requests
import sys


Expand All @@ -21,7 +19,6 @@ class Courses:

host = 'http://coursefinder.utoronto.ca/course-search/search'
cookies = http.cookiejar.CookieJar()
s = requests.Session()
threads = 32

@staticmethod
Expand Down Expand Up @@ -74,37 +71,10 @@ def search(query='', requirements=''):
'campusParam': 'St. George,Scarborough,Mississauga'
}

# Keep trying to get data until a proper response is given
json = None
while json is None:
try:
r = Courses.s.get(url, params=data,
cookies=Courses.cookies)
if r.status_code == 200:
json = r.json()
else:
sleep(0.5)
except requests.exceptions.Timeout:
continue
json = Scraper.get(url, params=data, cookies=Courses.cookies, json=True)

return json['aaData']

@staticmethod
def get_course_html(url):
"""Update the locally stored course pages."""

html = None
while html is None:
try:
r = Courses.s.get(url, cookies=Courses.cookies)
if r.status_code == 200:
html = r.text
except (requests.exceptions.Timeout,
requests.exceptions.ConnectionError):
continue

return html.encode('utf-8')

@staticmethod
def parse_course_html(course_id, html):
"""Create JSON files from the HTML pages downloaded."""
Expand Down Expand Up @@ -283,7 +253,7 @@ def __init__(self, queue):
def run(self):
while True:
course_id, url, total = self.queue.get()
html = Courses.get_course_html(url)
html = Scraper.get(url, Courses.cookies)
course = Courses.parse_course_html(course_id, html)

CourseFinderWorker.lock.acquire()
Expand Down
12 changes: 5 additions & 7 deletions uoftscrapers/scrapers/exams/utm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict
import requests
from datetime import datetime
import pytz
import re

Expand All @@ -11,7 +10,6 @@ class UTMExams:
"""A scraper for UTM exams."""

host = 'https://m.utm.utoronto.ca/'
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down Expand Up @@ -42,8 +40,8 @@ def retrieve_exams(courses):
headers = {
'Referer': UTMExams.host
}
html = UTMExams.s.get('%s%s' % (UTMExams.host, course),
headers=headers).text
html = Scraper.get('%s%s' % (UTMExams.host, course),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')

course_code = soup.find('div', class_='title').text.strip()
Expand Down Expand Up @@ -103,8 +101,8 @@ def get_page_links(endpoint):
headers = {
'Referer': UTMExams.host
}
html = UTMExams.s.get('%s%s' % (UTMExams.host, endpoint),
headers=headers).text
html = Scraper.get('%s%s' % (UTMExams.host, endpoint),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')
return [li.find('a')['href']
for li in soup.find('ul', class_='link').find_all('li')]
Expand Down
6 changes: 2 additions & 4 deletions uoftscrapers/scrapers/exams/utsc.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict
import requests
from datetime import datetime
import pytz


class UTSCExams:
"""A scraper for UTSC exams."""

host = 'http://www.utsc.utoronto.ca/registrar/examination-schedule'
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand All @@ -21,7 +19,7 @@ def scrape(location='.'):
headers = {
'Referer': UTSCExams.host
}
html = UTSCExams.s.get('%s' % UTSCExams.host, headers=headers).text
html = Scraper.get('%s' % UTSCExams.host, headers=headers)
soup = BeautifulSoup(html, 'html.parser')

for table in soup.find_all('table', class_='views-table'):
Expand Down
8 changes: 3 additions & 5 deletions uoftscrapers/scrapers/exams/utsg.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from datetime import datetime, date
from collections import OrderedDict
import requests
from datetime import datetime, date
import pytz


Expand All @@ -13,7 +12,6 @@ class UTSGExams:
"""

host = 'http://www.artsci.utoronto.ca/current/exams/'
s = requests.Session()

@staticmethod
def scrape(location='.', year=None):
Expand All @@ -29,8 +27,8 @@ def scrape(location='.', year=None):
headers = {
'Referer': UTSGExams.host
}
html = UTSGExams.s.get('%s%s' % (UTSGExams.host, p),
headers=headers).text
html = Scraper.get('%s%s' % (UTSGExams.host, p),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')

if not soup.find('table', class_='vertical listing'):
Expand Down
6 changes: 2 additions & 4 deletions uoftscrapers/scrapers/food/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from ..utils import Scraper, LayersScraper
from bs4 import BeautifulSoup
from collections import OrderedDict
import requests


class Food:
Expand All @@ -12,7 +11,6 @@ class Food:

host = 'http://map.utoronto.ca/'
campuses = [('utsg', 2), ('utm', 1), ('utsc', 0)]
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down Expand Up @@ -101,8 +99,8 @@ def conv_time(t):
headers = {
'Referer': Food.host
}
html = Food.s.get('%s%s%s' % (Food.host, 'json/hours/', food_id),
headers=headers).text
html = Scraper.get('%s%s%s' % (Food.host, 'json/hours/', food_id),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')

hours = OrderedDict()
Expand Down
2 changes: 0 additions & 2 deletions uoftscrapers/scrapers/parking/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from collections import OrderedDict
from pprint import pprint
import json
import requests


class Parking:
Expand All @@ -18,7 +17,6 @@ class Parking:
'utm': 6,
'utsc': 5
}
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/shuttle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def scrape(location='.', month=None):
Scraper.logger.info('Fetching schedules for {0}-{1}-01 to {0}-{1}-{2}.'.format(year, month, days))

for day in range(1, days + 1):
html = Scraper.get_html(Shuttle.host % (year, month, day))
html = Scraper.get(Shuttle.host % (year, month, day))
schedule = Shuttle.parse_schedule_html(html)

Scraper.save_json(schedule, location, schedule['date'])
Expand Down
22 changes: 10 additions & 12 deletions uoftscrapers/scrapers/textbooks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
from queue import Queue
from threading import Thread, Lock
from time import time
import logging
import os
import re
import requests
import sys


Expand Down Expand Up @@ -104,9 +102,9 @@ def scrape(location='.'):

@staticmethod
def retrieve_terms():
r = requests.get('%s/buy_courselisting.asp' % Textbooks.host)
html = Scraper.get('%s/buy_courselisting.asp' % Textbooks.host)

listing = BeautifulSoup(r.text, "html.parser")
listing = BeautifulSoup(html, "html.parser")
terms = listing.find(id='fTerm').find_all('option')[1:]

accepted_terms = []
Expand Down Expand Up @@ -138,10 +136,10 @@ def retrieve_departments(terms):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

departments = BeautifulSoup(r.text, "xml").find_all('department')
departments = BeautifulSoup(xml, "xml").find_all('department')
for department in departments:
all_departments.append({
'dept_id': department.get('id'),
Expand All @@ -168,10 +166,10 @@ def retrieve_courses(department):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

courses = BeautifulSoup(r.text, "xml").find_all('course')
courses = BeautifulSoup(xml, "xml").find_all('course')
for course in courses:
all_courses.append({
'course_id': course.get('id'),
Expand All @@ -196,10 +194,10 @@ def retrieve_sections(course):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

sections = BeautifulSoup(r.text, "xml").find_all('section')
sections = BeautifulSoup(xml, "xml").find_all('section')
for section in sections:
all_sections.append({
'section_id': section.get('id'),
Expand All @@ -224,10 +222,10 @@ def retrieve_books(section):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

soup = BeautifulSoup(r.text, "html.parser")
soup = BeautifulSoup(xml, "html.parser")
books = soup.find_all('tr', { 'class': 'book' })

if books == None:
Expand Down
4 changes: 2 additions & 2 deletions uoftscrapers/scrapers/timetable/utsg.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def scrape(location):
sponsor.split('.')[0]
))

html = Scraper.get_html('%s/%s/%s' % (
html = Scraper.get('%s/%s/%s' % (
UTSGTimetable.host,
term,
sponsor
Expand Down Expand Up @@ -330,7 +330,7 @@ def format_data(text, regex):

@staticmethod
def get_sponsors(term):
html = Scraper.get_html('%s/%s/index.html' % (
html = Scraper.get('%s/%s/index.html' % (
UTSGTimetable.host,
term
))
Expand Down
Loading

0 comments on commit c2a6c76

Please sign in to comment.