From 66049788aa7e46bed144e49218d1ccd00fe491a7 Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Fri, 15 Apr 2016 11:05:25 -0400 Subject: [PATCH 01/12] Issue #36: Events Scraper - init --- uoftscrapers/scrapers/events/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 uoftscrapers/scrapers/events/__init__.py diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py new file mode 100644 index 0000000..e69de29 From 85adbffd94f8c94c26356d19666f60b8d070ba17 Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Fri, 15 Apr 2016 11:14:51 -0400 Subject: [PATCH 02/12] Base file for the Events scraper --- uoftscrapers/scrapers/events/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index e69de29..f75f471 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -0,0 +1,18 @@ +from ...scraper import Scraper +from datetime import datetime, date +from collections import OrderedDict +import json +import requests + +class UTEvents: + """A scraper for Events at the University of Toronto.""" + + host = 'https://www.events.utoronto.ca/' + s = requests.Session() + + @staticmethod + def scrape(location='.'): + Scraper.logger.info('UTEvents initialized.') + Scraper.logger.info('Not implemented.') + raise NotImplementedError('This scraper has not been implemented yet.') + Scraper.logger.info('UTEvents completed.') From 0a87bccfc2d354aef783689cb52ead7566412299 Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Fri, 15 Apr 2016 13:30:15 -0400 Subject: [PATCH 03/12] Import BeautifulSoup - HTML/XML parser and Timezone module --- uoftscrapers/scrapers/events/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index f75f471..d434b8b 100644 --- 
a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -1,6 +1,9 @@ from ...scraper import Scraper from datetime import datetime, date from collections import OrderedDict +from bs4 import BeautifulSoup +import pytz + import json import requests From 25b3810a326a1bccb6b041b659cdee826e0d836d Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Fri, 15 Apr 2016 13:58:36 -0400 Subject: [PATCH 04/12] Add UTEvents to module --- uoftscrapers/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/uoftscrapers/__init__.py b/uoftscrapers/__init__.py index 8585315..cba1e3f 100644 --- a/uoftscrapers/__init__.py +++ b/uoftscrapers/__init__.py @@ -34,6 +34,8 @@ from .scrapers.shuttle import Shuttle +from .scrapers.events import UTEvents + class NullHandler(logging.Handler): def emit(self, record): From 40f6e7c1c9ba53a9e0a71a071feb63711819866e Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Fri, 15 Apr 2016 13:58:55 -0400 Subject: [PATCH 05/12] Add Campuses references --- uoftscrapers/scrapers/events/__init__.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index d434b8b..2fdefff 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -1,16 +1,23 @@ -from ...scraper import Scraper +from bs4 import BeautifulSoup from datetime import datetime, date from collections import OrderedDict -from bs4 import BeautifulSoup +from ..scraper import Scraper +import requests import pytz - import json -import requests class UTEvents: """A scraper for Events at the University of Toronto.""" host = 'https://www.events.utoronto.ca/' + + campuses = [ + {"id" : 1, "name" : "St. 
George", "tag" : "utsg"}, + {"id" : 2, "name" : "U of T Mississauga", "tag" : "utm"}, + {"id" : 3, "name" : "U of T Scarborough", "tag" : "utsc"}, + {"id" : 0, "name" : "Off Campus", "tag" : "off"}, + ] + s = requests.Session() @staticmethod From d4b0235395e974ec9fc960e6dbe458ac32f4e686 Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Tue, 19 Apr 2016 11:18:38 -0400 Subject: [PATCH 06/12] First layer, get the links to all the events --- uoftscrapers/scrapers/events/__init__.py | 30 ++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index 2fdefff..b04f3af 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -6,6 +6,10 @@ import pytz import json + +import urllib.parse as urlparse +from urllib.parse import urlencode + class UTEvents: """A scraper for Events at the University of Toronto.""" @@ -22,7 +26,29 @@ class UTEvents: @staticmethod def scrape(location='.'): + Scraper.logger.info('UTEvents initialized.') - Scraper.logger.info('Not implemented.') - raise NotImplementedError('This scraper has not been implemented yet.') + return UTEvents.get_events_links() Scraper.logger.info('UTEvents completed.') + + @staticmethod + def get_events_links(): + page_index_url = UTEvents.host + "index.php" + url_parts = list(urlparse.urlparse(page_index_url)) + events_links = [] + paging_index = 1 + events_count = 10 + while(events_count == 10): + params = { + 'p': paging_index + } + url_parts[4] = urlencode(params) + paging_index += 1 + html = UTEvents.s.get(urlparse.urlunparse(url_parts)).text + soup = BeautifulSoup(html, 'html.parser') + events_dom_arr = soup.select('#results')[0].find_all('li') + events_count = len(events_dom_arr) + events_links += list(map(lambda e: e.a['href'], events_dom_arr)) + return(events_links) + + From 797c05d6f91682f58659fd1e0f958e492a1b8369 Mon Sep 17 00:00:00 2001 From: Hanchen Wang 
Date: Tue, 19 Apr 2016 15:02:49 -0400 Subject: [PATCH 07/12] Events parsing and json dumps --- uoftscrapers/scrapers/events/__init__.py | 106 +++++++++++++++++++---- 1 file changed, 88 insertions(+), 18 deletions(-) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index b04f3af..03de13c 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -1,39 +1,40 @@ -from bs4 import BeautifulSoup +from ..scraper import Scraper +from bs4 import BeautifulSoup, NavigableString from datetime import datetime, date from collections import OrderedDict -from ..scraper import Scraper +import urllib.parse as urlparse +from urllib.parse import urlencode import requests import pytz import json - - -import urllib.parse as urlparse -from urllib.parse import urlencode +import re class UTEvents: """A scraper for Events at the University of Toronto.""" - host = 'https://www.events.utoronto.ca/' - - campuses = [ - {"id" : 1, "name" : "St. 
George", "tag" : "utsg"}, - {"id" : 2, "name" : "U of T Mississauga", "tag" : "utm"}, - {"id" : 3, "name" : "U of T Scarborough", "tag" : "utsc"}, - {"id" : 0, "name" : "Off Campus", "tag" : "off"}, - ] - s = requests.Session() @staticmethod def scrape(location='.'): - Scraper.logger.info('UTEvents initialized.') - return UTEvents.get_events_links() + Scraper.ensure_location(location) + + def scrape_event(doc): + Scraper.logger.info('Scraped event: %s ' % ( + doc['id'], + )) + with open('%s/%s.json' % (location, doc['id']), 'w') as fp: + json.dump(doc, fp) + + for event_link in UTEvents.get_events_links(): + doc = UTEvents.get_event_doc(event_link) + scrape_event(doc) + Scraper.logger.info('UTEvents completed.') @staticmethod def get_events_links(): - page_index_url = UTEvents.host + "index.php" + page_index_url = UTEvents.host + 'index.php' url_parts = list(urlparse.urlparse(page_index_url)) events_links = [] paging_index = 1 @@ -51,4 +52,73 @@ def get_events_links(): events_links += list(map(lambda e: e.a['href'], events_dom_arr)) return(events_links) + @staticmethod + def get_event_doc(url_tail): + event_url = UTEvents.host + url_tail + html = UTEvents.s.get(event_url).text + url_parts = list(urlparse.urlparse(event_url)) + query = dict(urlparse.parse_qsl(url_parts[4])) + soup = BeautifulSoup(html, 'html.parser') + + event_id = query['eventid'] + event_title = soup.select('.eventTitle')[0].text + raw_time = soup.select('.date')[0].text.split(',') + + date_arr = raw_time[0].split(' - ') + time_arr = re.split(' - | ', raw_time[1].strip()) + + # Some of the strings are misformed and gives an extra empty space + time_arr = list(filter(None, time_arr)) + event_start_date = datetime.strptime(date_arr[0], + '%b %d').replace(year=date.today().year).date().isoformat() + event_end_date = datetime.strptime(date_arr[-1], + '%b %d').replace(year=date.today().year).date().isoformat() + + # Note: Some events span across several days e.g. 
8350, thus specifying dates makes no sense + event_meridiem = time_arr[2] + event_start_time = time_arr[0] + ' ' + event_meridiem + event_end_time = time_arr[1] + ' ' + event_meridiem + + evt_bar = soup.select('#evt_bar')[0] + event_url = evt_bar.select('dd')[1].a['href'] + event_price = evt_bar.select('dl')[1].dd.text + + event_campus = 'Off Campus' + if evt_bar.select('dd')[0].b != None: + event_campus = evt_bar.select('dd')[0].b.text + + event_address = '' + address_block = evt_bar.select('dd')[0] + if address_block.a != None: + address_block = address_block.a + for content in address_block.contents: + text = content if type(content) == NavigableString else content.text + event_address += text.strip().replace('\r', '') + ' ' + + event_audiences = list(map(lambda a: a.text, + evt_bar.select('dl')[1].select('dd')[1].select('a'))) + + soup.select('.eventTitle')[0].extract() + soup.select('.date')[0].extract() + evt_bar.extract() + soup.select('#cal_bar')[0].extract() + event_description = '' + for content in soup.select('#content')[0].contents: + text = content if type(content) == NavigableString else content.text + event_description += text.strip().replace('\r', '') + ' ' + doc = OrderedDict([ + ('id', event_id), + ('title', event_title), + ('start_date', event_start_date), + ('end_date', event_end_date), + ('start_time', event_start_time), + ('end_time', event_end_time), + ('url', event_url), + ('description', event_description), + ('admission_price', event_price), + ('campus', event_campus), + ('address', event_address), + ('audiences', event_audiences), + ]) + return doc \ No newline at end of file From 2cbfff5ca4f99aaf923e747596f0dacae8547a3b Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Tue, 19 Apr 2016 15:08:15 -0400 Subject: [PATCH 08/12] Add UTEvents to readme.md --- README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.md b/README.md index 05fbc4f..87cbe2b 100644 --- a/README.md +++ b/README.md @@ -588,3 
+588,33 @@ https://m.utm.utoronto.ca/shuttle.php }] } ``` + +------ + +### UTEvents + +##### Class name +```python +uoftscrapers.UTEvents +``` + +##### Scraper source +https://www.events.utoronto.ca/ + +##### Output format +```js +{ + id: String, + title: String, + start_date: String + end_date: String, + start_time: String, + end_time: String, + url: String, + description: String, + admission_price: String, + campus: String, + address: String, + audiences: [String], +} +``` From 39c656e4aeee76c3127698aef08bf0231ce9f0b6 Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Tue, 19 Apr 2016 15:13:51 -0400 Subject: [PATCH 09/12] Additional stripping of irregular texts --- uoftscrapers/scrapers/events/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index 03de13c..b80d11a 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -61,7 +61,7 @@ def get_event_doc(url_tail): soup = BeautifulSoup(html, 'html.parser') event_id = query['eventid'] - event_title = soup.select('.eventTitle')[0].text + event_title = soup.select('.eventTitle')[0].text.strip() raw_time = soup.select('.date')[0].text.split(',') date_arr = raw_time[0].split(' - ') @@ -94,6 +94,7 @@ def get_event_doc(url_tail): for content in address_block.contents: text = content if type(content) == NavigableString else content.text event_address += text.strip().replace('\r', '') + ' ' + event_address = event_address.strip() event_audiences = list(map(lambda a: a.text, evt_bar.select('dl')[1].select('dd')[1].select('a'))) @@ -106,6 +107,7 @@ def get_event_doc(url_tail): for content in soup.select('#content')[0].contents: text = content if type(content) == NavigableString else content.text event_description += text.strip().replace('\r', '') + ' ' + event_description = event_description.strip() doc = OrderedDict([ From 
17a054c396f20338b88fb43785a740a53cf5d41c Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Tue, 19 Apr 2016 15:34:02 -0400 Subject: [PATCH 10/12] Change naming convention from UTEvents to Events --- README.md | 4 ++-- uoftscrapers/__init__.py | 2 +- uoftscrapers/scrapers/events/__init__.py | 18 +++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 87cbe2b..95a85aa 100644 --- a/README.md +++ b/README.md @@ -591,11 +591,11 @@ https://m.utm.utoronto.ca/shuttle.php ------ -### UTEvents +### Events ##### Class name ```python -uoftscrapers.UTEvents +uoftscrapers.Events ``` ##### Scraper source diff --git a/uoftscrapers/__init__.py b/uoftscrapers/__init__.py index cba1e3f..dc64217 100644 --- a/uoftscrapers/__init__.py +++ b/uoftscrapers/__init__.py @@ -34,7 +34,7 @@ from .scrapers.shuttle import Shuttle -from .scrapers.events import UTEvents +from .scrapers.events import Events class NullHandler(logging.Handler): diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index b80d11a..c25872a 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -9,14 +9,14 @@ import json import re -class UTEvents: +class Events: """A scraper for Events at the University of Toronto.""" host = 'https://www.events.utoronto.ca/' s = requests.Session() @staticmethod def scrape(location='.'): - Scraper.logger.info('UTEvents initialized.') + Scraper.logger.info('Events initialized.') Scraper.ensure_location(location) def scrape_event(doc): @@ -26,15 +26,15 @@ def scrape_event(doc): with open('%s/%s.json' % (location, doc['id']), 'w') as fp: json.dump(doc, fp) - for event_link in UTEvents.get_events_links(): - doc = UTEvents.get_event_doc(event_link) + for event_link in Events.get_events_links(): + doc = Events.get_event_doc(event_link) scrape_event(doc) - Scraper.logger.info('UTEvents completed.') + Scraper.logger.info('Events completed.') 
@staticmethod def get_events_links(): - page_index_url = UTEvents.host + 'index.php' + page_index_url = Events.host + 'index.php' url_parts = list(urlparse.urlparse(page_index_url)) events_links = [] paging_index = 1 @@ -45,7 +45,7 @@ def get_events_links(): } url_parts[4] = urlencode(params) paging_index += 1 - html = UTEvents.s.get(urlparse.urlunparse(url_parts)).text + html = Events.s.get(urlparse.urlunparse(url_parts)).text soup = BeautifulSoup(html, 'html.parser') events_dom_arr = soup.select('#results')[0].find_all('li') events_count = len(events_dom_arr) @@ -54,8 +54,8 @@ def get_events_links(): @staticmethod def get_event_doc(url_tail): - event_url = UTEvents.host + url_tail - html = UTEvents.s.get(event_url).text + event_url = Events.host + url_tail + html = Events.s.get(event_url).text url_parts = list(urlparse.urlparse(event_url)) query = dict(urlparse.parse_qsl(url_parts[4])) soup = BeautifulSoup(html, 'html.parser') From a73a68c929338bdfa24005b259a83667f92c038b Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Tue, 19 Apr 2016 15:41:40 -0400 Subject: [PATCH 11/12] Use Scraper built-in functions --- uoftscrapers/scrapers/events/__init__.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index c25872a..fc5f6e7 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -1,34 +1,23 @@ -from ..scraper import Scraper +from ..utils import Scraper from bs4 import BeautifulSoup, NavigableString from datetime import datetime, date from collections import OrderedDict import urllib.parse as urlparse from urllib.parse import urlencode -import requests -import pytz -import json import re class Events: """A scraper for Events at the University of Toronto.""" host = 'https://www.events.utoronto.ca/' - s = requests.Session() @staticmethod def scrape(location='.'): Scraper.logger.info('Events 
initialized.') Scraper.ensure_location(location) - - def scrape_event(doc): - Scraper.logger.info('Scraped event: %s ' % ( - doc['id'], - )) - with open('%s/%s.json' % (location, doc['id']), 'w') as fp: - json.dump(doc, fp) for event_link in Events.get_events_links(): doc = Events.get_event_doc(event_link) - scrape_event(doc) + Scraper.save_json(doc, location, doc['id']) Scraper.logger.info('Events completed.') @@ -45,7 +34,7 @@ def get_events_links(): } url_parts[4] = urlencode(params) paging_index += 1 - html = Events.s.get(urlparse.urlunparse(url_parts)).text + html = Scraper.get(urlparse.urlunparse(url_parts)) soup = BeautifulSoup(html, 'html.parser') events_dom_arr = soup.select('#results')[0].find_all('li') events_count = len(events_dom_arr) @@ -55,7 +44,7 @@ def get_events_links(): @staticmethod def get_event_doc(url_tail): event_url = Events.host + url_tail - html = Events.s.get(event_url).text + html = Scraper.get(event_url) url_parts = list(urlparse.urlparse(event_url)) query = dict(urlparse.parse_qsl(url_parts[4])) soup = BeautifulSoup(html, 'html.parser') From b3fc5d0da664cda23e93f56c081d2c363baf068b Mon Sep 17 00:00:00 2001 From: Hanchen Wang Date: Tue, 19 Apr 2016 17:55:30 -0400 Subject: [PATCH 12/12] Change address to location and set campus to empty string on Off Campus events --- README.md | 2 +- uoftscrapers/scrapers/events/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 95a85aa..3e5b2e1 100644 --- a/README.md +++ b/README.md @@ -614,7 +614,7 @@ https://www.events.utoronto.ca/ description: String, admission_price: String, campus: String, - address: String, + location: String, audiences: [String], } ``` diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index fc5f6e7..492944e 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -72,7 +72,7 @@ def get_event_doc(url_tail): event_url = 
evt_bar.select('dd')[1].a['href'] event_price = evt_bar.select('dl')[1].dd.text - event_campus = 'Off Campus' + event_campus = '' if evt_bar.select('dd')[0].b != None: event_campus = evt_bar.select('dd')[0].b.text @@ -109,7 +109,7 @@ def get_event_doc(url_tail): ('description', event_description), ('admission_price', event_price), ('campus', event_campus), - ('address', event_address), + ('location', event_address), ('audiences', event_audiences), ]) return doc \ No newline at end of file