From 42ba10c18fcbe8d8f724879f683600fc4ad124a8 Mon Sep 17 00:00:00 2001
From: Eugene Cheung
Date: Tue, 19 Apr 2016 19:17:41 -0400
Subject: [PATCH 1/2] Formatting

---
 uoftscrapers/__init__.py                    |  1 +
 uoftscrapers/scrapers/athletics/__init__.py |  1 +
 uoftscrapers/scrapers/buildings/__init__.py |  6 +--
 uoftscrapers/scrapers/courses/__init__.py   |  4 +-
 uoftscrapers/scrapers/exams/utm.py          |  9 +++--
 uoftscrapers/scrapers/exams/utsc.py         |  3 +-
 uoftscrapers/scrapers/exams/utsg.py         |  2 +-
 uoftscrapers/scrapers/food/__init__.py      |  8 ++--
 uoftscrapers/scrapers/shuttle/__init__.py   | 12 ++++-
 uoftscrapers/scrapers/textbooks/__init__.py | 19 ++++-----
 uoftscrapers/scrapers/timetable/utsg.py     | 43 +++++++++++----------
 uoftscrapers/scrapers/utils/scraper.py      |  5 ++-
 12 files changed, 62 insertions(+), 51 deletions(-)

diff --git a/uoftscrapers/__init__.py b/uoftscrapers/__init__.py
index dc64217..6a5f4a5 100644
--- a/uoftscrapers/__init__.py
+++ b/uoftscrapers/__init__.py
@@ -38,6 +38,7 @@
 
 
 class NullHandler(logging.Handler):
+
     def emit(self, record):
         pass
 
diff --git a/uoftscrapers/scrapers/athletics/__init__.py b/uoftscrapers/scrapers/athletics/__init__.py
index ad42955..f3790a0 100644
--- a/uoftscrapers/scrapers/athletics/__init__.py
+++ b/uoftscrapers/scrapers/athletics/__init__.py
@@ -5,6 +5,7 @@
 
 
 class Athletics:
+
     @staticmethod
     def scrape(location='.'):
         Scraper.logger.info('Athletics initialized.')
diff --git a/uoftscrapers/scrapers/buildings/__init__.py b/uoftscrapers/scrapers/buildings/__init__.py
index ca38d20..ce93c03 100644
--- a/uoftscrapers/scrapers/buildings/__init__.py
+++ b/uoftscrapers/scrapers/buildings/__init__.py
@@ -35,7 +35,7 @@ def scrape(location='.'):
             lng = LayersScraper.get_value(building, 'lng', True)
 
             street = ' '.join(filter(None,
-                LayersScraper.get_value(building, 'street').split(' ')))
+                                      LayersScraper.get_value(building, 'street').split(' ')))
             city = LayersScraper.get_value(building, 'city')
             province = LayersScraper.get_value(building, 'province')
             country = LayersScraper.get_value(building, 'country')
@@ -82,7 +82,7 @@ def get_map_json(campus):
 
         Scraper.get(Buildings.host)
 
-        headers = { 'Referer': Buildings.host }
+        headers = {'Referer': Buildings.host}
         html = Scraper.get('%s%s%s' % (
             Buildings.host,
             'data/map/',
@@ -98,7 +98,7 @@ def get_regions_json(campus):
 
         Scraper.get(Buildings.host)
 
-        headers = { 'Referer': Buildings.host }
+        headers = {'Referer': Buildings.host}
         html = Scraper.get('%s%s%s' % (
             Buildings.host,
             'data/regions/',
diff --git a/uoftscrapers/scrapers/courses/__init__.py b/uoftscrapers/scrapers/courses/__init__.py
index c73a013..ff04736 100755
--- a/uoftscrapers/scrapers/courses/__init__.py
+++ b/uoftscrapers/scrapers/courses/__init__.py
@@ -170,7 +170,7 @@ def parse_course_html(course_id, html):
                     times.append(raw_times[i] + " " + raw_times[i + 1])
 
                 instructors = BeautifulSoup(str(tds[2]).replace("<br>", "\n"),
-                    "html.parser")
+                                            "html.parser")
                 instructors = instructors.get_text().split("\n")
                 instructors = \
                     list(filter(None, [x.strip() for x in instructors]))
@@ -197,7 +197,7 @@ def parse_course_html(course_id, html):
 
                     for i in range(len(hours)):
                         x = hours[i].split(':')
-                        hours[i] = int(x[0]) + (int(x[1])/60)
+                        hours[i] = int(x[0]) + (int(x[1]) / 60)
 
                     time_data.append(OrderedDict([
                         ("day", day),
diff --git a/uoftscrapers/scrapers/exams/utm.py b/uoftscrapers/scrapers/exams/utm.py
index 87b2473..dfbccdb 100644
--- a/uoftscrapers/scrapers/exams/utm.py
+++ b/uoftscrapers/scrapers/exams/utm.py
@@ -41,7 +41,7 @@ def retrieve_exams(courses):
                 'Referer': UTMExams.host
             }
             html = Scraper.get('%s%s' % (UTMExams.host, course),
-                headers=headers)
+                               headers=headers)
             soup = BeautifulSoup(html, 'html.parser')
 
             course_code = soup.find('div', class_='title').text.strip()
@@ -102,7 +102,7 @@ def get_page_links(endpoint):
             'Referer': UTMExams.host
         }
         html = Scraper.get('%s%s' % (UTMExams.host, endpoint),
-            headers=headers)
+                           headers=headers)
         soup = BeautifulSoup(html, 'html.parser')
         return [li.find('a')['href'] for li in soup.find('ul', class_='link').find_all('li')]
 
@@ -135,7 +135,7 @@ def get_date(month, date, year):
     @staticmethod
     def get_course_id(course_code, date):
         d = datetime.strptime(date, '%Y-%m-%d')
-        month, year, period = d.strftime("%b").lower(), d.year, UTMExams.get_period(date)
+        month, year, period = d.strftime('%b').lower(), d.year, UTMExams.get_period(date)
         endings = {
             'dec': {
                 'F': '%s9' % str(year),
@@ -176,7 +176,8 @@ def parse_sections(room):
     def parse_time(start, end, date):
         def convert_time(t):
             h, m, s = [int(x) for x in t.split(':')]
-            d = datetime.strptime('%s %s %s %s' % (date, h, m, s), '%Y-%m-%d %H %M %S')
+            d = datetime.strptime('%s %s %s %s' % (date, h, m, s),
+                                  '%Y-%m-%d %H %M %S')
             return d.replace(tzinfo=pytz.timezone('US/Eastern')).isoformat()
         return convert_time(start), convert_time(end)
 
diff --git a/uoftscrapers/scrapers/exams/utsc.py b/uoftscrapers/scrapers/exams/utsc.py
index f7a0ae0..cbaddbc 100644
--- a/uoftscrapers/scrapers/exams/utsc.py
+++ b/uoftscrapers/scrapers/exams/utsc.py
@@ -95,7 +95,8 @@ def get_date(month, date, year):
     @staticmethod
     def get_course_id(course_code, date):
         d = datetime.strptime(date, '%Y-%m-%d')
-        month, year, period = d.strftime("%b").lower(), d.year, UTSCExams.get_period(date)
+        month, year, period = d.strftime(
+            "%b").lower(), d.year, UTSCExams.get_period(date)
         endings = {
             'dec': {
                 'F': '%s9' % str(year),
diff --git a/uoftscrapers/scrapers/exams/utsg.py b/uoftscrapers/scrapers/exams/utsg.py
index f7035ed..11c7966 100644
--- a/uoftscrapers/scrapers/exams/utsg.py
+++ b/uoftscrapers/scrapers/exams/utsg.py
@@ -28,7 +28,7 @@ def scrape(location='.', year=None):
                 'Referer': UTSGExams.host
             }
             html = Scraper.get('%s%s' % (UTSGExams.host, p),
-                headers=headers)
+                               headers=headers)
             soup = BeautifulSoup(html, 'html.parser')
 
             if not soup.find('table', class_='vertical listing'):
diff --git a/uoftscrapers/scrapers/food/__init__.py b/uoftscrapers/scrapers/food/__init__.py
index 939174b..50ba555 100644
--- a/uoftscrapers/scrapers/food/__init__.py
+++ b/uoftscrapers/scrapers/food/__init__.py
@@ -28,7 +28,7 @@ def scrape(location='.'):
             building_id = LayersScraper.get_value(entry, 'building_code')
 
             address = ' '.join(filter(None,
-                LayersScraper.get_value(entry, 'address').split()))
+                                      LayersScraper.get_value(entry, 'address').split()))
             hours = Food.get_hours(id_)
             short_name = LayersScraper.get_value(entry, 'slug')
 
@@ -38,7 +38,7 @@ def scrape(location='.'):
                                  'html.parser').text
 
             tags = list(filter(None,
-                LayersScraper.get_value(entry, 'tags').lower().split(', ')))
+                               LayersScraper.get_value(entry, 'tags').lower().split(', ')))
             image = LayersScraper.get_value(entry, 'image')
             lat = LayersScraper.get_value(entry, 'lat', True)
 
@@ -85,7 +85,7 @@ def conv_time(t):
 
             # for mistyped times (i.e. http://map.utoronto.ca/json/hours/1329)
             if t[0] == ':':
-                time = time[1:len(time)-2] + ':' + time[-2:]
+                time = time[1:len(time) - 2] + ':' + time[-2:]
 
             m = 0
             if ':' in time:
@@ -100,7 +100,7 @@ def get_hours(food_id):
             'Referer': Food.host
         }
         html = Scraper.get('%s%s%s' % (Food.host, 'json/hours/', food_id),
-            headers=headers)
+                           headers=headers)
         soup = BeautifulSoup(html, 'html.parser')
 
         hours = OrderedDict()
diff --git a/uoftscrapers/scrapers/shuttle/__init__.py b/uoftscrapers/scrapers/shuttle/__init__.py
index dbe7376..fb643a8 100644
--- a/uoftscrapers/scrapers/shuttle/__init__.py
+++ b/uoftscrapers/scrapers/shuttle/__init__.py
@@ -32,7 +32,8 @@ def scrape(location='.', month=None):
         month = now.strftime('%m') if month is None else str(month).zfill(2)
         days = monthrange(int(year), int(month))[1]
 
-        Scraper.logger.info('Fetching schedules for {0}-{1}-01 to {0}-{1}-{2}.'.format(year, month, days))
+        Scraper.logger.info(
+            'Fetching schedules for {0}-{1}-01 to {0}-{1}-{2}.'.format(year, month, days))
 
         for day in range(1, days + 1):
             html = Scraper.get(Shuttle.host % (year, month, day))
@@ -49,7 +50,8 @@ def parse_schedule_html(html):
         soup = BeautifulSoup(html, 'html.parser')
 
         # Get date
-        date = time.strftime('%Y-%m-%d', time.strptime(soup.find('h2').get_text().strip(), '%b %d %Y'))
+        date = time.strftime(
+            '%Y-%m-%d', time.strptime(soup.find('h2').get_text().strip(), '%b %d %Y'))
 
         # Get route data
         routes = {}
@@ -65,7 +67,8 @@ def parse_schedule_html(html):
             times = []
             for _route_time in _route_times:
                 _route_time_text = _route_time.get_text().strip().lower()
-                _route_time_clean = re.sub('\*.*\*', '', _route_time_text).strip()
+                _route_time_clean = re.sub(
+                    '\*.*\*', '', _route_time_text).strip()
 
                 time_rush_hour = 'rush hour' in _route_time_text
                 time_no_overload = 'no overload' in _route_time_text
@@ -73,7 +76,8 @@ def parse_schedule_html(html):
                 times.append(OrderedDict([
                     ('time', '%sT%s:00-04:00' % (
                         date,
-                        time.strftime('%H:%M %p', time.strptime(_route_time_clean, '%I:%M %p'))[:-3]
+                        time.strftime('%H:%M %p', time.strptime(
+                            _route_time_clean, '%I:%M %p'))[:-3]
                     )),
                     ('rush_hour', time_rush_hour),
                     ('no_overload', time_no_overload)
diff --git a/uoftscrapers/scrapers/textbooks/__init__.py b/uoftscrapers/scrapers/textbooks/__init__.py
index 21793b9..71ad746 100644
--- a/uoftscrapers/scrapers/textbooks/__init__.py
+++ b/uoftscrapers/scrapers/textbooks/__init__.py
@@ -93,8 +93,8 @@ def scrape(location='.'):
             book['courses'] = sorted(book['courses'], key=itemgetter('id'))
             for i in range(len(book['courses'])):
                 book['courses'][i]['meeting_sections'] = \
-                    sorted(book['courses'][i]['meeting_sections'],
-                        key=itemgetter('code'))
+                    sorted(book['courses'][i]['meeting_sections'],
+                           key=itemgetter('code'))
 
             Scraper.save_json(book, location, book['id'])
 
@@ -111,7 +111,7 @@ def retrieve_terms():
         for term in terms:
             val = term.get_text()
             if val.startswith('ST GEORGE') or val.startswith('MISSISSAUGA') \
-                or val.startswith('SCARBOROUGH'):
+                    or val.startswith('SCARBOROUGH'):
                 accepted_terms.append(term)
 
         return accepted_terms
@@ -137,7 +137,7 @@ def retrieve_departments(terms):
             }
 
             xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-                params=payload, headers=headers)
+                              params=payload, headers=headers)
             departments = BeautifulSoup(xml, "xml").find_all('department')
 
             for department in departments:
@@ -148,7 +148,8 @@ def retrieve_departments(terms):
                     'session': session
                 })
 
-            Scraper.logger.info('Retreived department info from %s.' % term_name)
+            Scraper.logger.info(
+                'Retreived department info from %s.' % term_name)
 
         return all_departments
 
@@ -167,7 +168,7 @@ def retrieve_courses(department):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-            params=payload, headers=headers)
+                          params=payload, headers=headers)
         courses = BeautifulSoup(xml, "xml").find_all('course')
 
         for course in courses:
@@ -195,7 +196,7 @@ def retrieve_sections(course):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-            params=payload, headers=headers)
+                          params=payload, headers=headers)
         sections = BeautifulSoup(xml, "xml").find_all('section')
 
         for section in sections:
@@ -223,10 +224,10 @@ def retrieve_books(section):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-            params=payload, headers=headers)
+                          params=payload, headers=headers)
         soup = BeautifulSoup(xml, "html.parser")
 
-        books = soup.find_all('tr', { 'class': 'book' })
+        books = soup.find_all('tr', {'class': 'book'})
 
         if books == None:
             done += 1
diff --git a/uoftscrapers/scrapers/timetable/utsg.py b/uoftscrapers/scrapers/timetable/utsg.py
index d91eef1..352d8ca 100644
--- a/uoftscrapers/scrapers/timetable/utsg.py
+++ b/uoftscrapers/scrapers/timetable/utsg.py
@@ -93,15 +93,15 @@ def parse_sponsor(html, year, term, sponsor=''):
             if len(tds) >= 9:
 
                 course_code = UTSGTimetable.format_data(tds[0].get_text(),
-                    '([A-Z]{3}[0-9]{3}[HY]1)')
+                                                        '([A-Z]{3}[0-9]{3}[HY]1)')
 
                 if len(course_code) > 0:
                     course_info.append(current_course)
 
                     current_course = [
-                        None, # course code
-                        None, # name
-                        OrderedDict([]), # sections
+                        None,  # course code
+                        None,  # name
+                        OrderedDict([]),  # sections
                         []
                     ]
 
@@ -112,17 +112,17 @@ def parse_sponsor(html, year, term, sponsor=''):
                     current_course[1] = name
 
                 section = UTSGTimetable.format_data(tds[3].get_text(),
-                    '([LTP][0-9]{4})')
+                                                    '([LTP][0-9]{4})')
 
                 if len(section) > 0:
                     current_section = section
 
                 time = UTSGTimetable.format_data(tds[5].get_text(),
-                    '([MTWRFS]{1,3}[0-9]{1,2}(?::[0-9]' +
-                    '{2})?(?:-[0-9]{1,2}(?::[0-9]{2})?)?)')
+                                                 '([MTWRFS]{1,3}[0-9]{1,2}(?::[0-9]' +
+                                                 '{2})?(?:-[0-9]{1,2}(?::[0-9]{2})?)?)')
 
                 location = UTSGTimetable.format_data(tds[6].get_text(),
-                    '([A-Z]{2,4}[ ]?[0-9]{1,8})')
+                                                     '([A-Z]{2,4}[ ]?[0-9]{1,8})')
 
                 instructors = tds[7].get_text().strip()
 
@@ -132,7 +132,8 @@ def parse_sponsor(html, year, term, sponsor=''):
                 if len(instructors) > 0:
                     instructors = instructors.split('/')
                     for i in range(len(instructors)):
-                        instructors[i] = ' '.join([x.strip() for x in instructors[i].split('.')])
+                        instructors[i] = ' '.join(
+                            [x.strip() for x in instructors[i].split('.')])
                 else:
                     instructors = []
 
@@ -152,37 +153,37 @@ def parse_sponsor(html, year, term, sponsor=''):
                 if tds[0].get('colspan') == '6':
                     course_code = \
                         UTSGTimetable.format_data(tds[0].get_text(),
-                            '([A-Z]{3}[0-9]{3}[HY]{1}1[YFS]{1})')
+                                                  '([A-Z]{3}[0-9]{3}[HY]{1}1[YFS]{1})')
                     breadths = [int(x) for x in re.findall('(?:\()([12345])(?:\))',
-                        tds[0].get_text().strip())]
+                                                           tds[0].get_text().strip())]
                     name = ''.join(tds[0].get_text()
-                        .replace('Categories ', ':')
-                        .replace('Categories:', ':')
-                        .split(':')[1:]).split(', Count')[0].strip()
+                                   .replace('Categories ', ':')
+                                   .replace('Categories:', ':')
+                                   .split(':')[1:]).split(', Count')[0].strip()
 
                     if len(course_code) > 0:
                         course_info.append(current_course)
 
                         current_course = [
                             course_code,  # course code
                             name,  # name
-                            OrderedDict([]), # sections
+                            OrderedDict([]),  # sections
                             breadths
                         ]
 
                 else:
                     section = UTSGTimetable.format_data(tds[0].get_text(),
-                        '([LTP][0-9]{4})')
+                                                        '([LTP][0-9]{4})')
 
                     if len(section) > 0:
                         current_section = section
 
                     time = UTSGTimetable.format_data(tds[3].get_text(),
-                        '([MTWRFS]{1,3}[0-9]{1,2}' +
-                        '(?::[0-9]{2})?(?:-[0-9]{1,2}' +
-                        '(?::[0-9]{2})?)?)')
+                                                     '([MTWRFS]{1,3}[0-9]{1,2}' +
+                                                     '(?::[0-9]{2})?(?:-[0-9]{1,2}' +
+                                                     '(?::[0-9]{2})?)?)')
 
                     location = UTSGTimetable.format_data(tds[4].get_text(),
-                        '([A-Z]{2,4}[ ]?[0-9]{1,8})')
+                                                         '([A-Z]{2,4}[ ]?[0-9]{1,8})')
 
                     instructors = tds[5].get_text().strip()
 
@@ -198,7 +199,7 @@ def parse_sponsor(html, year, term, sponsor=''):
 
                 try:
                     if not isinstance(current_course[2][current_section],
-                        list):
+                                      list):
                         current_course[2][current_section] = []
                 except KeyError:
                     current_course[2][current_section] = []
diff --git a/uoftscrapers/scrapers/utils/scraper.py b/uoftscrapers/scrapers/utils/scraper.py
index 4812d24..bc1981a 100644
--- a/uoftscrapers/scrapers/utils/scraper.py
+++ b/uoftscrapers/scrapers/utils/scraper.py
@@ -24,7 +24,7 @@ def ensure_location(location):
     def save_json(data, location, filename):
         Scraper.ensure_location(location)
 
-        with open('%s/%s.json' % (location, filename),'w+') as outfile:
+        with open('%s/%s.json' % (location, filename), 'w+') as outfile:
             json.dump(data, outfile)
 
     @staticmethod
@@ -35,7 +35,8 @@ def get(url, params=None, cookies=None, headers=None, json=False, max_attempts=1
         attempts = 0
         while doc is None and attempts < max_attempts:
             try:
-                r = Scraper.s.get(url, params=params, cookies=cookies, headers=headers)
+                r = Scraper.s.get(url, params=params,
+                                  cookies=cookies, headers=headers)
                 if r.status_code == 200:
                     doc = r
                 else:

From 1df21a31c762a5ef33fb1fbd6b0614c8db757c5a Mon Sep 17 00:00:00 2001
From: Eugene Cheung
Date: Tue, 19 Apr 2016 19:21:13 -0400
Subject: [PATCH 2/2] Formatting

---
 uoftscrapers/scrapers/events/__init__.py | 53 +++++++++++++-----------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py
index 492944e..1ab16fa 100644
--- a/uoftscrapers/scrapers/events/__init__.py
+++ b/uoftscrapers/scrapers/events/__init__.py
@@ -1,10 +1,11 @@
 from ..utils import Scraper
 from bs4 import BeautifulSoup, NavigableString
-from datetime import datetime, date
 from collections import OrderedDict
-import urllib.parse as urlparse
+from datetime import datetime, date
 from urllib.parse import urlencode
 import re
+import urllib.parse as urlparse
+
 
 class Events:
     """A scraper for Events at the University of Toronto."""
@@ -14,7 +15,7 @@ class Events:
     def scrape(location='.'):
         Scraper.logger.info('Events initialized.')
         Scraper.ensure_location(location)
-
+
         for event_link in Events.get_events_links():
             doc = Events.get_event_doc(event_link)
             Scraper.save_json(doc, location, doc['id'])
@@ -27,11 +28,12 @@ def get_events_links():
         url_parts = list(urlparse.urlparse(page_index_url))
         events_links = []
         paging_index = 1
-        events_count = 10
-        while(events_count == 10):
+        events_count = 10
+
+        while events_count == 10:
             params = {
-              'p': paging_index
-              }
+                'p': paging_index
+            }
             url_parts[4] = urlencode(params)
             paging_index += 1
             html = Scraper.get(urlparse.urlunparse(url_parts))
@@ -39,7 +41,8 @@ def get_events_links():
             soup = BeautifulSoup(html, 'html.parser')
             events_dom_arr = soup.select('#results')[0].find_all('li')
             events_count = len(events_dom_arr)
             events_links += list(map(lambda e: e.a['href'], events_dom_arr))
-        return(events_links)
+
+        return events_links
 
     @staticmethod
     def get_event_doc(url_tail):
@@ -52,41 +55,43 @@ def get_event_doc(url_tail):
         event_id = query['eventid']
         event_title = soup.select('.eventTitle')[0].text.strip()
         raw_time = soup.select('.date')[0].text.split(',')
-
+
         date_arr = raw_time[0].split(' - ')
         time_arr = re.split(' - | ', raw_time[1].strip())
-
+
         # Some of the strings are misformed and gives an extra empty space
         time_arr = list(filter(None, time_arr))
 
         event_start_date = datetime.strptime(date_arr[0],
-            '%b %d').replace(year=date.today().year).date().isoformat()
-        event_end_date = datetime.strptime(date_arr[-1],
-            '%b %d').replace(year=date.today().year).date().isoformat()
+                                             '%b %d').replace(year=date.today().year).date().isoformat()
+        event_end_date = datetime.strptime(date_arr[-1],
+                                           '%b %d').replace(year=date.today().year).date().isoformat()
 
-        # Note: Some events span across several days e.g. 8350, thus specifying dates makes no sense
+        # Note: Some events span across several days e.g. 8350, thus specifying
+        # dates makes no sense
         event_meridiem = time_arr[2]
         event_start_time = time_arr[0] + ' ' + event_meridiem
         event_end_time = time_arr[1] + ' ' + event_meridiem
-
+
         evt_bar = soup.select('#evt_bar')[0]
         event_url = evt_bar.select('dd')[1].a['href']
         event_price = evt_bar.select('dl')[1].dd.text
         event_campus = ''
-        if evt_bar.select('dd')[0].b != None:
+        if evt_bar.select('dd')[0].b is not None:
             event_campus = evt_bar.select('dd')[0].b.text
 
         event_address = ''
         address_block = evt_bar.select('dd')[0]
-        if address_block.a != None:
+        if address_block.a is not None:
             address_block = address_block.a
         for content in address_block.contents:
-            text = content if type(content) == NavigableString else content.text
+            text = content if type(
+                content) == NavigableString else content.text
             event_address += text.strip().replace('\r', '') + ' '
         event_address = event_address.strip()
 
-        event_audiences = list(map(lambda a: a.text,
-            evt_bar.select('dl')[1].select('dd')[1].select('a')))
+        event_audiences = list(map(lambda a: a.text,
+                                   evt_bar.select('dl')[1].select('dd')[1].select('a')))
 
         soup.select('.eventTitle')[0].extract()
         soup.select('.date')[0].extract()
@@ -94,11 +99,12 @@ def get_event_doc(url_tail):
         soup.select('#cal_bar')[0].extract()
         event_description = ''
         for content in soup.select('#content')[0].contents:
-            text = content if type(content) == NavigableString else content.text
+            text = content if type(
+                content) == NavigableString else content.text
             event_description += text.strip().replace('\r', '') + ' '
         event_description = event_description.strip()
 
-        doc = OrderedDict([
+        return OrderedDict([
             ('id', event_id),
             ('title', event_title),
             ('start_date', event_start_date),
@@ -110,6 +116,5 @@ def get_event_doc(url_tail):
             ('admission_price', event_price),
             ('campus', event_campus),
             ('location', event_address),
-            ('audiences', event_audiences),
+            ('audiences', event_audiences)
         ])
-        return doc
\ No newline at end of file