From 1416ef0c077f073d2ae6520e713c6c19a3ae9a10 Mon Sep 17 00:00:00 2001 From: Eugene Cheung Date: Sun, 17 Apr 2016 16:39:15 -0400 Subject: [PATCH 1/2] Use Scraper.get in all scrapers --- uoftscrapers/scrapers/athletics/utm.py | 2 +- uoftscrapers/scrapers/athletics/utsc.py | 2 +- uoftscrapers/scrapers/buildings/__init__.py | 14 ++++---- uoftscrapers/scrapers/courses/__init__.py | 36 ++------------------- uoftscrapers/scrapers/exams/utm.py | 12 +++---- uoftscrapers/scrapers/exams/utsc.py | 6 ++-- uoftscrapers/scrapers/exams/utsg.py | 8 ++--- uoftscrapers/scrapers/food/__init__.py | 6 ++-- uoftscrapers/scrapers/parking/__init__.py | 2 -- uoftscrapers/scrapers/shuttle/__init__.py | 2 +- uoftscrapers/scrapers/textbooks/__init__.py | 22 ++++++------- uoftscrapers/scrapers/timetable/utsg.py | 4 +-- uoftscrapers/scrapers/utils/scraper.py | 21 ++++++++---- 13 files changed, 51 insertions(+), 86 deletions(-) diff --git a/uoftscrapers/scrapers/athletics/utm.py b/uoftscrapers/scrapers/athletics/utm.py index 60ae255..bf2ded5 100644 --- a/uoftscrapers/scrapers/athletics/utm.py +++ b/uoftscrapers/scrapers/athletics/utm.py @@ -20,7 +20,7 @@ def scrape(location='.', month=None): month = month or UTMAthletics.get_month(month) Scraper.logger.info('UTMAthletics initialized.') - html = Scraper.get_html('%s%s' % (UTMAthletics.host, month)) + html = Scraper.get('%s%s' % (UTMAthletics.host, month)) soup = BeautifulSoup(html, 'html.parser') athletics = OrderedDict() diff --git a/uoftscrapers/scrapers/athletics/utsc.py b/uoftscrapers/scrapers/athletics/utsc.py index 5a8f1a5..f77f691 100644 --- a/uoftscrapers/scrapers/athletics/utsc.py +++ b/uoftscrapers/scrapers/athletics/utsc.py @@ -20,7 +20,7 @@ def scrape(location='.', month=None): month = month or UTSCAthletics.get_month(month) Scraper.logger.info('UTSCAthletics initialized.') - html = Scraper.get_html('%s%s' % (UTSCAthletics.host, month)) + html = Scraper.get('%s%s' % (UTSCAthletics.host, month)) soup = BeautifulSoup(html, 
'html.parser') athletics = OrderedDict() diff --git a/uoftscrapers/scrapers/buildings/__init__.py b/uoftscrapers/scrapers/buildings/__init__.py index f3be615..ca38d20 100644 --- a/uoftscrapers/scrapers/buildings/__init__.py +++ b/uoftscrapers/scrapers/buildings/__init__.py @@ -5,7 +5,6 @@ import json import os import re -import requests class Buildings: @@ -16,7 +15,6 @@ class Buildings: host = 'http://map.utoronto.ca/' campuses = ['utsg', 'utm', 'utsc'] - s = requests.Session() @staticmethod def scrape(location='.'): @@ -82,14 +80,14 @@ def get_map_json(campus): Scraper.logger.info('Scraping %s.' % campus.upper()) - Buildings.s.get(Buildings.host) + Scraper.get(Buildings.host) headers = { 'Referer': Buildings.host } - html = Buildings.s.get('%s%s%s' % ( + html = Scraper.get('%s%s%s' % ( Buildings.host, 'data/map/', campus - ), headers=headers).text + ), headers=headers) data = json.loads(html) return data @@ -98,14 +96,14 @@ def get_map_json(campus): def get_regions_json(campus): """Retrieve the JSON structure from host.""" - Buildings.s.get(Buildings.host) + Scraper.get(Buildings.host) headers = { 'Referer': Buildings.host } - html = Buildings.s.get('%s%s%s' % ( + html = Scraper.get('%s%s%s' % ( Buildings.host, 'data/regions/', campus - ), headers=headers).text + ), headers=headers) data = json.loads(html) return data diff --git a/uoftscrapers/scrapers/courses/__init__.py b/uoftscrapers/scrapers/courses/__init__.py index 84f69eb..008ca5f 100755 --- a/uoftscrapers/scrapers/courses/__init__.py +++ b/uoftscrapers/scrapers/courses/__init__.py @@ -3,13 +3,11 @@ from collections import OrderedDict from queue import Queue from threading import Thread, Lock -from time import time, sleep +from time import time import http.cookiejar import json -import logging import os import re -import requests import sys @@ -21,7 +19,6 @@ class Courses: host = 'http://coursefinder.utoronto.ca/course-search/search' cookies = http.cookiejar.CookieJar() - s = requests.Session() threads = 
32 @staticmethod @@ -74,37 +71,10 @@ def search(query='', requirements=''): 'campusParam': 'St. George,Scarborough,Mississauga' } - # Keep trying to get data until a proper response is given - json = None - while json is None: - try: - r = Courses.s.get(url, params=data, - cookies=Courses.cookies) - if r.status_code == 200: - json = r.json() - else: - sleep(0.5) - except requests.exceptions.Timeout: - continue + json = Scraper.get(url, params=data, cookies=Courses.cookies, json=True) return json['aaData'] - @staticmethod - def get_course_html(url): - """Update the locally stored course pages.""" - - html = None - while html is None: - try: - r = Courses.s.get(url, cookies=Courses.cookies) - if r.status_code == 200: - html = r.text - except (requests.exceptions.Timeout, - requests.exceptions.ConnectionError): - continue - - return html.encode('utf-8') - @staticmethod def parse_course_html(course_id, html): """Create JSON files from the HTML pages downloaded.""" @@ -283,7 +253,7 @@ def __init__(self, queue): def run(self): while True: course_id, url, total = self.queue.get() - html = Courses.get_course_html(url) + html = Scraper.get(url) course = Courses.parse_course_html(course_id, html) CourseFinderWorker.lock.acquire() diff --git a/uoftscrapers/scrapers/exams/utm.py b/uoftscrapers/scrapers/exams/utm.py index 80bb36e..87b2473 100644 --- a/uoftscrapers/scrapers/exams/utm.py +++ b/uoftscrapers/scrapers/exams/utm.py @@ -1,8 +1,7 @@ from ..utils import Scraper from bs4 import BeautifulSoup -from datetime import datetime from collections import OrderedDict -import requests +from datetime import datetime import pytz import re @@ -11,7 +10,6 @@ class UTMExams: """A scraper for UTM exams.""" host = 'https://m.utm.utoronto.ca/' - s = requests.Session() @staticmethod def scrape(location='.'): @@ -42,8 +40,8 @@ def retrieve_exams(courses): headers = { 'Referer': UTMExams.host } - html = UTMExams.s.get('%s%s' % (UTMExams.host, course), - headers=headers).text + html = 
Scraper.get('%s%s' % (UTMExams.host, course), + headers=headers) soup = BeautifulSoup(html, 'html.parser') course_code = soup.find('div', class_='title').text.strip() @@ -103,8 +101,8 @@ def get_page_links(endpoint): headers = { 'Referer': UTMExams.host } - html = UTMExams.s.get('%s%s' % (UTMExams.host, endpoint), - headers=headers).text + html = Scraper.get('%s%s' % (UTMExams.host, endpoint), + headers=headers) soup = BeautifulSoup(html, 'html.parser') return [li.find('a')['href'] for li in soup.find('ul', class_='link').find_all('li')] diff --git a/uoftscrapers/scrapers/exams/utsc.py b/uoftscrapers/scrapers/exams/utsc.py index 02c5c09..f7a0ae0 100644 --- a/uoftscrapers/scrapers/exams/utsc.py +++ b/uoftscrapers/scrapers/exams/utsc.py @@ -1,8 +1,7 @@ from ..utils import Scraper from bs4 import BeautifulSoup -from datetime import datetime from collections import OrderedDict -import requests +from datetime import datetime import pytz @@ -10,7 +9,6 @@ class UTSCExams: """A scraper for UTSC exams.""" host = 'http://www.utsc.utoronto.ca/registrar/examination-schedule' - s = requests.Session() @staticmethod def scrape(location='.'): @@ -21,7 +19,7 @@ def scrape(location='.'): headers = { 'Referer': UTSCExams.host } - html = UTSCExams.s.get('%s' % UTSCExams.host, headers=headers).text + html = Scraper.get('%s' % UTSCExams.host, headers=headers) soup = BeautifulSoup(html, 'html.parser') for table in soup.find_all('table', class_='views-table'): diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index 7ea6fa3..f7035ed 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -1,8 +1,7 @@ from ..utils import Scraper from bs4 import BeautifulSoup -from datetime import datetime, date from collections import OrderedDict -import requests +from datetime import datetime, date import pytz @@ -13,7 +12,6 @@ class UTSGExams: """ host = 'http://www.artsci.utoronto.ca/current/exams/' - s = requests.Session() 
@staticmethod def scrape(location='.', year=None): @@ -29,8 +27,8 @@ def scrape(location='.', year=None): headers = { 'Referer': UTSGExams.host } - html = UTSGExams.s.get('%s%s' % (UTSGExams.host, p), - headers=headers).text + html = Scraper.get('%s%s' % (UTSGExams.host, p), + headers=headers) soup = BeautifulSoup(html, 'html.parser') if not soup.find('table', class_='vertical listing'): diff --git a/uoftscrapers/scrapers/food/__init__.py b/uoftscrapers/scrapers/food/__init__.py index 34533f7..939174b 100644 --- a/uoftscrapers/scrapers/food/__init__.py +++ b/uoftscrapers/scrapers/food/__init__.py @@ -1,7 +1,6 @@ from ..utils import Scraper, LayersScraper from bs4 import BeautifulSoup from collections import OrderedDict -import requests class Food: @@ -12,7 +11,6 @@ class Food: host = 'http://map.utoronto.ca/' campuses = [('utsg', 2), ('utm', 1), ('utsc', 0)] - s = requests.Session() @staticmethod def scrape(location='.'): @@ -101,8 +99,8 @@ def conv_time(t): headers = { 'Referer': Food.host } - html = Food.s.get('%s%s%s' % (Food.host, 'json/hours/', food_id), - headers=headers).text + html = Scraper.get('%s%s%s' % (Food.host, 'json/hours/', food_id), + headers=headers) soup = BeautifulSoup(html, 'html.parser') hours = OrderedDict() diff --git a/uoftscrapers/scrapers/parking/__init__.py b/uoftscrapers/scrapers/parking/__init__.py index af8b016..8bcc054 100644 --- a/uoftscrapers/scrapers/parking/__init__.py +++ b/uoftscrapers/scrapers/parking/__init__.py @@ -3,7 +3,6 @@ from collections import OrderedDict from pprint import pprint import json -import requests class Parking: @@ -18,7 +17,6 @@ class Parking: 'utm': 6, 'utsc': 5 } - s = requests.Session() @staticmethod def scrape(location='.'): diff --git a/uoftscrapers/scrapers/shuttle/__init__.py b/uoftscrapers/scrapers/shuttle/__init__.py index 65a1188..dbe7376 100644 --- a/uoftscrapers/scrapers/shuttle/__init__.py +++ b/uoftscrapers/scrapers/shuttle/__init__.py @@ -35,7 +35,7 @@ def scrape(location='.', month=None): 
Scraper.logger.info('Fetching schedules for {0}-{1}-01 to {0}-{1}-{2}.'.format(year, month, days)) for day in range(1, days + 1): - html = Scraper.get_html(Shuttle.host % (year, month, day)) + html = Scraper.get(Shuttle.host % (year, month, day)) schedule = Shuttle.parse_schedule_html(html) Scraper.save_json(schedule, location, schedule['date']) diff --git a/uoftscrapers/scrapers/textbooks/__init__.py b/uoftscrapers/scrapers/textbooks/__init__.py index 4c2f4e6..21793b9 100644 --- a/uoftscrapers/scrapers/textbooks/__init__.py +++ b/uoftscrapers/scrapers/textbooks/__init__.py @@ -6,10 +6,8 @@ from queue import Queue from threading import Thread, Lock from time import time -import logging import os import re -import requests import sys @@ -104,9 +102,9 @@ def scrape(location='.'): @staticmethod def retrieve_terms(): - r = requests.get('%s/buy_courselisting.asp' % Textbooks.host) + html = Scraper.get('%s/buy_courselisting.asp' % Textbooks.host) - listing = BeautifulSoup(r.text, "html.parser") + listing = BeautifulSoup(html, "html.parser") terms = listing.find(id='fTerm').find_all('option')[1:] accepted_terms = [] @@ -138,10 +136,10 @@ def retrieve_departments(terms): 'Referer': '%s/buy_courselisting.asp' % Textbooks.host } - r = requests.get('%s/textbooks_xml.asp' % Textbooks.host, + xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host, params=payload, headers=headers) - departments = BeautifulSoup(r.text, "xml").find_all('department') + departments = BeautifulSoup(xml, "xml").find_all('department') for department in departments: all_departments.append({ 'dept_id': department.get('id'), @@ -168,10 +166,10 @@ def retrieve_courses(department): 'Referer': '%s/buy_courselisting.asp' % Textbooks.host } - r = requests.get('%s/textbooks_xml.asp' % Textbooks.host, + xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host, params=payload, headers=headers) - courses = BeautifulSoup(r.text, "xml").find_all('course') + courses = BeautifulSoup(xml, "xml").find_all('course') 
for course in courses: all_courses.append({ 'course_id': course.get('id'), @@ -196,10 +194,10 @@ def retrieve_sections(course): 'Referer': '%s/buy_courselisting.asp' % Textbooks.host } - r = requests.get('%s/textbooks_xml.asp' % Textbooks.host, + xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host, params=payload, headers=headers) - sections = BeautifulSoup(r.text, "xml").find_all('section') + sections = BeautifulSoup(xml, "xml").find_all('section') for section in sections: all_sections.append({ 'section_id': section.get('id'), @@ -224,10 +222,10 @@ def retrieve_books(section): 'Referer': '%s/buy_courselisting.asp' % Textbooks.host } - r = requests.get('%s/textbooks_xml.asp' % Textbooks.host, + xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host, params=payload, headers=headers) - soup = BeautifulSoup(r.text, "html.parser") + soup = BeautifulSoup(xml, "html.parser") books = soup.find_all('tr', { 'class': 'book' }) if books == None: diff --git a/uoftscrapers/scrapers/timetable/utsg.py b/uoftscrapers/scrapers/timetable/utsg.py index 9fd774e..d91eef1 100644 --- a/uoftscrapers/scrapers/timetable/utsg.py +++ b/uoftscrapers/scrapers/timetable/utsg.py @@ -41,7 +41,7 @@ def scrape(location): sponsor.split('.')[0] )) - html = Scraper.get_html('%s/%s/%s' % ( + html = Scraper.get('%s/%s/%s' % ( UTSGTimetable.host, term, sponsor @@ -330,7 +330,7 @@ def format_data(text, regex): @staticmethod def get_sponsors(term): - html = Scraper.get_html('%s/%s/index.html' % ( + html = Scraper.get('%s/%s/index.html' % ( UTSGTimetable.host, term )) diff --git a/uoftscrapers/scrapers/utils/scraper.py b/uoftscrapers/scrapers/utils/scraper.py index 42a058f..4812d24 100644 --- a/uoftscrapers/scrapers/utils/scraper.py +++ b/uoftscrapers/scrapers/utils/scraper.py @@ -1,3 +1,4 @@ +from time import sleep import json import logging import os @@ -27,22 +28,30 @@ def save_json(data, location, filename): json.dump(data, outfile) @staticmethod - def get_html(url, params=None, cookies=None, 
headers=None, max_attempts=10): - """Fetches the HTML page source, automatically retrying if it times out.""" + def get(url, params=None, cookies=None, headers=None, json=False, max_attempts=10): + """Fetches an Internet document, automatically retrying if it times out.""" - html = None + doc = None attempts = 0 - while html is None and attempts < max_attempts: + while doc is None and attempts < max_attempts: try: r = Scraper.s.get(url, params=params, cookies=cookies, headers=headers) if r.status_code == 200: - html = r.text + doc = r + else: + sleep(0.5) except (requests.exceptions.Timeout, requests.exceptions.ConnectionError): attempts += 1 continue - return html.encode('utf-8') if html else None + if doc is None: + return None + + if json: + return doc.json() + else: + return doc.text.encode('utf-8') @staticmethod def flush_percentage(decimal): From 9b1d3c720a4be9401b630bab676176019baf7fc6 Mon Sep 17 00:00:00 2001 From: Eugene Cheung Date: Sun, 17 Apr 2016 16:42:18 -0400 Subject: [PATCH 2/2] Course cookies --- uoftscrapers/scrapers/courses/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uoftscrapers/scrapers/courses/__init__.py b/uoftscrapers/scrapers/courses/__init__.py index 008ca5f..c73a013 100755 --- a/uoftscrapers/scrapers/courses/__init__.py +++ b/uoftscrapers/scrapers/courses/__init__.py @@ -253,7 +253,7 @@ def __init__(self, queue): def run(self): while True: course_id, url, total = self.queue.get() - html = Scraper.get(url) + html = Scraper.get(url, cookies=Courses.cookies) course = Courses.parse_course_html(course_id, html) CourseFinderWorker.lock.acquire()