Skip to content

Commit

Permalink
Merge pull request #55 from arkon/parent-get
Browse files · Browse the repository at this point in the history
Use Scraper.get in all scrapers
  • Loading branch information
qasim committed Apr 17, 2016
2 parents: e1d388e + 9b1d3c7 · commit c2a6c76
Show file tree
Hide file tree
Showing 13 changed files with 51 additions and 86 deletions.
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/athletics/utm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def scrape(location='.', month=None):
month = month or UTMAthletics.get_month(month)

Scraper.logger.info('UTMAthletics initialized.')
html = Scraper.get_html('%s%s' % (UTMAthletics.host, month))
html = Scraper.get('%s%s' % (UTMAthletics.host, month))
soup = BeautifulSoup(html, 'html.parser')

athletics = OrderedDict()
Expand Down
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/athletics/utsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def scrape(location='.', month=None):
month = month or UTSCAthletics.get_month(month)

Scraper.logger.info('UTSCAthletics initialized.')
html = Scraper.get_html('%s%s' % (UTSCAthletics.host, month))
html = Scraper.get('%s%s' % (UTSCAthletics.host, month))
soup = BeautifulSoup(html, 'html.parser')

athletics = OrderedDict()
Expand Down
14 changes: 6 additions & 8 deletions uoftscrapers/scrapers/buildings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import json
import os
import re
import requests


class Buildings:
Expand All @@ -16,7 +15,6 @@ class Buildings:

host = 'http://map.utoronto.ca/'
campuses = ['utsg', 'utm', 'utsc']
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down Expand Up @@ -82,14 +80,14 @@ def get_map_json(campus):

Scraper.logger.info('Scraping %s.' % campus.upper())

Buildings.s.get(Buildings.host)
Scraper.get(Buildings.host)

headers = { 'Referer': Buildings.host }
html = Buildings.s.get('%s%s%s' % (
html = Scraper.get('%s%s%s' % (
Buildings.host,
'data/map/',
campus
), headers=headers).text
), headers=headers)

data = json.loads(html)
return data
Expand All @@ -98,14 +96,14 @@ def get_map_json(campus):
def get_regions_json(campus):
"""Retrieve the JSON structure from host."""

Buildings.s.get(Buildings.host)
Scraper.get(Buildings.host)

headers = { 'Referer': Buildings.host }
html = Buildings.s.get('%s%s%s' % (
html = Scraper.get('%s%s%s' % (
Buildings.host,
'data/regions/',
campus
), headers=headers).text
), headers=headers)

data = json.loads(html)
return data
36 changes: 3 additions & 33 deletions uoftscrapers/scrapers/courses/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
from collections import OrderedDict
from queue import Queue
from threading import Thread, Lock
from time import time, sleep
from time import time
import http.cookiejar
import json
import logging
import os
import re
import requests
import sys


Expand All @@ -21,7 +19,6 @@ class Courses:

host = 'http://coursefinder.utoronto.ca/course-search/search'
cookies = http.cookiejar.CookieJar()
s = requests.Session()
threads = 32

@staticmethod
Expand Down Expand Up @@ -74,37 +71,10 @@ def search(query='', requirements=''):
'campusParam': 'St. George,Scarborough,Mississauga'
}

# Keep trying to get data until a proper response is given
json = None
while json is None:
try:
r = Courses.s.get(url, params=data,
cookies=Courses.cookies)
if r.status_code == 200:
json = r.json()
else:
sleep(0.5)
except requests.exceptions.Timeout:
continue
json = Scraper.get(url, params=data, cookies=Courses.cookies, json=True)

return json['aaData']

@staticmethod
def get_course_html(url):
"""Update the locally stored course pages."""

html = None
while html is None:
try:
r = Courses.s.get(url, cookies=Courses.cookies)
if r.status_code == 200:
html = r.text
except (requests.exceptions.Timeout,
requests.exceptions.ConnectionError):
continue

return html.encode('utf-8')

@staticmethod
def parse_course_html(course_id, html):
"""Create JSON files from the HTML pages downloaded."""
Expand Down Expand Up @@ -283,7 +253,7 @@ def __init__(self, queue):
def run(self):
while True:
course_id, url, total = self.queue.get()
html = Courses.get_course_html(url)
html = Scraper.get(url, Courses.cookies)
course = Courses.parse_course_html(course_id, html)

CourseFinderWorker.lock.acquire()
Expand Down
12 changes: 5 additions & 7 deletions uoftscrapers/scrapers/exams/utm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict
import requests
from datetime import datetime
import pytz
import re

Expand All @@ -11,7 +10,6 @@ class UTMExams:
"""A scraper for UTM exams."""

host = 'https://m.utm.utoronto.ca/'
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down Expand Up @@ -42,8 +40,8 @@ def retrieve_exams(courses):
headers = {
'Referer': UTMExams.host
}
html = UTMExams.s.get('%s%s' % (UTMExams.host, course),
headers=headers).text
html = Scraper.get('%s%s' % (UTMExams.host, course),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')

course_code = soup.find('div', class_='title').text.strip()
Expand Down Expand Up @@ -103,8 +101,8 @@ def get_page_links(endpoint):
headers = {
'Referer': UTMExams.host
}
html = UTMExams.s.get('%s%s' % (UTMExams.host, endpoint),
headers=headers).text
html = Scraper.get('%s%s' % (UTMExams.host, endpoint),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')
return [li.find('a')['href']
for li in soup.find('ul', class_='link').find_all('li')]
Expand Down
6 changes: 2 additions & 4 deletions uoftscrapers/scrapers/exams/utsc.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict
import requests
from datetime import datetime
import pytz


class UTSCExams:
"""A scraper for UTSC exams."""

host = 'http://www.utsc.utoronto.ca/registrar/examination-schedule'
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand All @@ -21,7 +19,7 @@ def scrape(location='.'):
headers = {
'Referer': UTSCExams.host
}
html = UTSCExams.s.get('%s' % UTSCExams.host, headers=headers).text
html = Scraper.get('%s' % UTSCExams.host, headers=headers)
soup = BeautifulSoup(html, 'html.parser')

for table in soup.find_all('table', class_='views-table'):
Expand Down
8 changes: 3 additions & 5 deletions uoftscrapers/scrapers/exams/utsg.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from datetime import datetime, date
from collections import OrderedDict
import requests
from datetime import datetime, date
import pytz


Expand All @@ -13,7 +12,6 @@ class UTSGExams:
"""

host = 'http://www.artsci.utoronto.ca/current/exams/'
s = requests.Session()

@staticmethod
def scrape(location='.', year=None):
Expand All @@ -29,8 +27,8 @@ def scrape(location='.', year=None):
headers = {
'Referer': UTSGExams.host
}
html = UTSGExams.s.get('%s%s' % (UTSGExams.host, p),
headers=headers).text
html = Scraper.get('%s%s' % (UTSGExams.host, p),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')

if not soup.find('table', class_='vertical listing'):
Expand Down
6 changes: 2 additions & 4 deletions uoftscrapers/scrapers/food/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from ..utils import Scraper, LayersScraper
from bs4 import BeautifulSoup
from collections import OrderedDict
import requests


class Food:
Expand All @@ -12,7 +11,6 @@ class Food:

host = 'http://map.utoronto.ca/'
campuses = [('utsg', 2), ('utm', 1), ('utsc', 0)]
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down Expand Up @@ -101,8 +99,8 @@ def conv_time(t):
headers = {
'Referer': Food.host
}
html = Food.s.get('%s%s%s' % (Food.host, 'json/hours/', food_id),
headers=headers).text
html = Scraper.get('%s%s%s' % (Food.host, 'json/hours/', food_id),
headers=headers)
soup = BeautifulSoup(html, 'html.parser')

hours = OrderedDict()
Expand Down
2 changes: 0 additions & 2 deletions uoftscrapers/scrapers/parking/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from collections import OrderedDict
from pprint import pprint
import json
import requests


class Parking:
Expand All @@ -18,7 +17,6 @@ class Parking:
'utm': 6,
'utsc': 5
}
s = requests.Session()

@staticmethod
def scrape(location='.'):
Expand Down
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/shuttle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def scrape(location='.', month=None):
Scraper.logger.info('Fetching schedules for {0}-{1}-01 to {0}-{1}-{2}.'.format(year, month, days))

for day in range(1, days + 1):
html = Scraper.get_html(Shuttle.host % (year, month, day))
html = Scraper.get(Shuttle.host % (year, month, day))
schedule = Shuttle.parse_schedule_html(html)

Scraper.save_json(schedule, location, schedule['date'])
Expand Down
22 changes: 10 additions & 12 deletions uoftscrapers/scrapers/textbooks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
from queue import Queue
from threading import Thread, Lock
from time import time
import logging
import os
import re
import requests
import sys


Expand Down Expand Up @@ -104,9 +102,9 @@ def scrape(location='.'):

@staticmethod
def retrieve_terms():
r = requests.get('%s/buy_courselisting.asp' % Textbooks.host)
html = Scraper.get('%s/buy_courselisting.asp' % Textbooks.host)

listing = BeautifulSoup(r.text, "html.parser")
listing = BeautifulSoup(html, "html.parser")
terms = listing.find(id='fTerm').find_all('option')[1:]

accepted_terms = []
Expand Down Expand Up @@ -138,10 +136,10 @@ def retrieve_departments(terms):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

departments = BeautifulSoup(r.text, "xml").find_all('department')
departments = BeautifulSoup(xml, "xml").find_all('department')
for department in departments:
all_departments.append({
'dept_id': department.get('id'),
Expand All @@ -168,10 +166,10 @@ def retrieve_courses(department):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

courses = BeautifulSoup(r.text, "xml").find_all('course')
courses = BeautifulSoup(xml, "xml").find_all('course')
for course in courses:
all_courses.append({
'course_id': course.get('id'),
Expand All @@ -196,10 +194,10 @@ def retrieve_sections(course):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

sections = BeautifulSoup(r.text, "xml").find_all('section')
sections = BeautifulSoup(xml, "xml").find_all('section')
for section in sections:
all_sections.append({
'section_id': section.get('id'),
Expand All @@ -224,10 +222,10 @@ def retrieve_books(section):
'Referer': '%s/buy_courselisting.asp' % Textbooks.host
}

r = requests.get('%s/textbooks_xml.asp' % Textbooks.host,
xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
params=payload, headers=headers)

soup = BeautifulSoup(r.text, "html.parser")
soup = BeautifulSoup(xml, "html.parser")
books = soup.find_all('tr', { 'class': 'book' })

if books == None:
Expand Down
4 changes: 2 additions & 2 deletions uoftscrapers/scrapers/timetable/utsg.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def scrape(location):
sponsor.split('.')[0]
))

html = Scraper.get_html('%s/%s/%s' % (
html = Scraper.get('%s/%s/%s' % (
UTSGTimetable.host,
term,
sponsor
Expand Down Expand Up @@ -330,7 +330,7 @@ def format_data(text, regex):

@staticmethod
def get_sponsors(term):
html = Scraper.get_html('%s/%s/index.html' % (
html = Scraper.get('%s/%s/index.html' % (
UTSGTimetable.host,
term
))
Expand Down
Loading

0 comments on commit c2a6c76

Please sign in to comment.