From f96989d067f6fd073d04f96bdf2ae314c9b02d49 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 20 Apr 2016 01:00:16 -0400 Subject: [PATCH 1/4] Use request helper function in LayersScraper --- uoftscrapers/scrapers/utils/layers.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/uoftscrapers/scrapers/utils/layers.py b/uoftscrapers/scrapers/utils/layers.py index cb70b66..289a840 100644 --- a/uoftscrapers/scrapers/utils/layers.py +++ b/uoftscrapers/scrapers/utils/layers.py @@ -10,7 +10,6 @@ class LayersScraper: """ host = 'http://map.utoronto.ca/' - s = requests.Session() @staticmethod def get_layers_json(campus): @@ -18,16 +17,13 @@ def get_layers_json(campus): Scraper.logger.info('Retrieving map layers for %s.' % campus.upper()) - headers = { - 'Referer': LayersScraper.host - } - html = LayersScraper.s.get('%s%s%s' % ( + headers = {'Referer': LayersScraper.host} + data = Scraper.get('%s%s%s' % ( LayersScraper.host, 'data/map/', campus - ), headers=headers).text + ), headers=headers, json=True) - data = json.loads(html) return data['layers'] @staticmethod From 095f5772d8c494f46ddf2a9d17492030bcafcd09 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 20 Apr 2016 01:00:55 -0400 Subject: [PATCH 2/4] Patch json getters for Buildings --- uoftscrapers/scrapers/buildings/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/uoftscrapers/scrapers/buildings/__init__.py b/uoftscrapers/scrapers/buildings/__init__.py index ce93c03..b5716fe 100644 --- a/uoftscrapers/scrapers/buildings/__init__.py +++ b/uoftscrapers/scrapers/buildings/__init__.py @@ -2,7 +2,6 @@ from bs4 import BeautifulSoup from collections import OrderedDict from decimal import * -import json import os import re @@ -83,13 +82,12 @@ def get_map_json(campus): Scraper.get(Buildings.host) headers = {'Referer': Buildings.host} - html = Scraper.get('%s%s%s' % ( + data = Scraper.get('%s%s%s' % ( Buildings.host, 'data/map/', campus - ), headers=headers) + ), headers=headers, json=True) - data = json.loads(html) return data @staticmethod @@ -99,11 +97,10 @@ def get_regions_json(campus): Scraper.get(Buildings.host) headers = {'Referer': Buildings.host} - html = Scraper.get('%s%s%s' % ( + data = Scraper.get('%s%s%s' % ( Buildings.host, 'data/regions/', campus - ), headers=headers) + ), headers=headers, json=True) - data = json.loads(html) return data From 26fe823a9dc214abb24d14d138b45a8a6be64727 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 20 Apr 2016 01:22:58 -0400 Subject: [PATCH 3/4] Fix infinite loop in request helper --- uoftscrapers/scrapers/utils/scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/uoftscrapers/scrapers/utils/scraper.py b/uoftscrapers/scrapers/utils/scraper.py index bc1981a..64e5bda 100644 --- a/uoftscrapers/scrapers/utils/scraper.py +++ b/uoftscrapers/scrapers/utils/scraper.py @@ -41,6 +41,7 @@ def get(url, params=None, cookies=None, headers=None, json=False, max_attempts=1 doc = r else: sleep(0.5) + attempts += 1 except (requests.exceptions.Timeout, requests.exceptions.ConnectionError): attempts += 1 From fac88e87835f51dfa730ef3927bc49e4677647f3 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Wed, 20 Apr 2016 01:23:20 -0400 Subject: [PATCH 4/4] Add max attempts for exam requests --- uoftscrapers/scrapers/exams/utsg.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py index 11c7966..2ba0fe0 100644 --- a/uoftscrapers/scrapers/exams/utsg.py +++ b/uoftscrapers/scrapers/exams/utsg.py @@ -28,10 +28,15 @@ def scrape(location='.', year=None): 'Referer': UTSGExams.host } html = Scraper.get('%s%s' % (UTSGExams.host, p), - headers=headers) - soup = BeautifulSoup(html, 'html.parser') + headers=headers, + max_attempts=3) - if not soup.find('table', class_='vertical listing'): + try: + soup = BeautifulSoup(html, 'html.parser') + except TypeError: + soup = None + + if not (html and soup and soup.find(class_='vertical listing')): # no exam data available Scraper.logger.info('No %s exams.' % p.upper()) continue