-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UTEvents #36 #56
UTEvents #36 #56
Changes from 9 commits
6604978
85adbff
0a87bcc
25b3810
40f6e7c
d4b0235
797c05d
2cbfff5
39c656e
17a054c
a73a68c
b3fc5d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
from ..scraper import Scraper | ||
from bs4 import BeautifulSoup, NavigableString | ||
from datetime import datetime, date | ||
from collections import OrderedDict | ||
import urllib.parse as urlparse | ||
from urllib.parse import urlencode | ||
import requests | ||
import pytz | ||
import json | ||
import re | ||
|
||
class UTEvents:
    """A scraper for Events at the University of Toronto.

    Event links are collected from the paginated index at ``host`` and each
    event page is turned into an ordered document that ``scrape`` writes to
    ``<location>/<id>.json``.
    """

    # NOTE(review): a reviewer suggested renaming this class; the proposed
    # name was lost when the review page was extracted.

    # Base URL; index and event paths are appended to it.
    host = 'https://www.events.utoronto.ca/'
    # One HTTP session shared by every request this class makes.
    s = requests.Session()

    @staticmethod
    def scrape(location='.'):
        """Scrape every listed event and write one JSON file per event.

        Args:
            location: Directory that receives the ``<id>.json`` files.
        """
        Scraper.logger.info('UTEvents initialized.')
        # NOTE(review): a reviewer flagged something at this point as "not
        # needed" - presumably this call; confirm before removing it.
        Scraper.ensure_location(location)

        for event_link in UTEvents.get_events_links():
            doc = UTEvents.get_event_doc(event_link)
            Scraper.logger.info('Scraped event: %s ' % (
                doc['id'],
            ))
            # NOTE(review): a reviewer asked for a different helper to be
            # used for this write; its name was lost in extraction.
            with open('%s/%s.json' % (location, doc['id']), 'w') as fp:
                json.dump(doc, fp)

        Scraper.logger.info('UTEvents completed.')

    @staticmethod
    def get_events_links():
        """Return the ``href`` of every event found on the index pages.

        Pages are requested as ``index.php?p=1``, ``?p=2``, ... and the walk
        stops after the first page that does not hold exactly ten entries.

        Raises:
            IndexError: If a page has no ``#results`` element.
        """
        page_index_url = UTEvents.host + 'index.php'
        # A page with exactly this many entries is treated as "more to come".
        full_page = 10
        events_links = []
        page = 1
        while True:
            page_url = '%s?%s' % (page_index_url, urlencode({'p': page}))
            # NOTE(review): a reviewer asked for a different helper to be
            # used for this request; its name was lost in extraction.
            html = UTEvents.s.get(page_url).text
            soup = BeautifulSoup(html, 'html.parser')
            entries = soup.select('#results')[0].find_all('li')
            events_links.extend(entry.a['href'] for entry in entries)
            if len(entries) != full_page:
                break
            page += 1
        return events_links

    @staticmethod
    def get_event_doc(url_tail):
        """Fetch one event page and return its fields as an OrderedDict.

        Args:
            url_tail: Link relative to ``host``; its query string must carry
                an ``eventid`` parameter, which becomes the document id.

        Returns:
            OrderedDict with the keys ``id``, ``title``, ``start_date``,
            ``end_date``, ``start_time``, ``end_time``, ``url``,
            ``description``, ``admission_price``, ``campus``, ``address``
            and ``audiences``.

        Raises:
            KeyError: If ``url_tail`` has no ``eventid`` query parameter.
            IndexError: If an expected page element is missing.
            ValueError: If a date on the page does not parse as ``%b %d``
                in the current year.
        """
        page_url = UTEvents.host + url_tail
        # NOTE(review): a reviewer asked for a different helper to be used
        # for this request; its name was lost in extraction.
        html = UTEvents.s.get(page_url).text
        query = dict(urlparse.parse_qsl(urlparse.urlparse(page_url).query))
        soup = BeautifulSoup(html, 'html.parser')

        event_id = query['eventid']
        title_tag = soup.select('.eventTitle')[0]
        date_tag = soup.select('.date')[0]
        event_title = title_tag.text.strip()

        # The text before the first comma holds the date (or "start - end"
        # dates); the text after it holds the times.
        raw_time = date_tag.text.split(',')
        date_arr = raw_time[0].split(' - ')
        time_arr = re.split(r' - | ', raw_time[1].strip())

        # Some of the strings are malformed and give an extra empty piece.
        time_arr = list(filter(None, time_arr))

        # The page gives no year, so the current one is assumed. It is read
        # once so both dates agree even across a New Year boundary.
        year = date.today().year
        event_start_date = UTEvents._parse_month_day(date_arr[0], year)
        event_end_date = UTEvents._parse_month_day(date_arr[-1], year)

        # Note: Some events span across several days e.g. 8350, thus
        # specifying dates makes no sense.
        # The third piece is used as the am/pm marker for both times.
        event_meridiem = time_arr[2]
        event_start_time = time_arr[0] + ' ' + event_meridiem
        event_end_time = time_arr[1] + ' ' + event_meridiem

        evt_bar = soup.select('#evt_bar')[0]
        dd_tags = evt_bar.select('dd')
        event_url = dd_tags[1].a['href']
        details = evt_bar.select('dl')[1]
        event_price = details.dd.text

        # NOTE(review): a reviewer proposed a different value for the
        # no-campus / off-campus case; the proposal was lost in extraction.
        location_dd = dd_tags[0]
        if location_dd.b is not None:
            event_campus = location_dd.b.text
        else:
            event_campus = 'Off Campus'

        # When the first <dd> wraps its text in a link, read the link's
        # contents instead of the <dd>'s own.
        address_block = location_dd
        if address_block.a is not None:
            address_block = address_block.a
        event_address = UTEvents._flatten_text(address_block.contents)

        event_audiences = [
            link.text for link in details.select('dd')[1].select('a')
        ]

        # Remove everything already captured so that what is left inside
        # #content is only the description.
        title_tag.extract()
        date_tag.extract()
        evt_bar.extract()
        soup.select('#cal_bar')[0].extract()
        event_description = UTEvents._flatten_text(
            soup.select('#content')[0].contents)

        # NOTE(review): reviewers discussed renaming the 'address' key; one
        # preferred a location-style name because the value sometimes
        # includes the name of the place. The key is unchanged here.
        return OrderedDict([
            ('id', event_id),
            ('title', event_title),
            ('start_date', event_start_date),
            ('end_date', event_end_date),
            ('start_time', event_start_time),
            ('end_time', event_end_time),
            ('url', event_url),
            ('description', event_description),
            ('admission_price', event_price),
            ('campus', event_campus),
            ('address', event_address),
            ('audiences', event_audiences),
        ])

    @staticmethod
    def _parse_month_day(text, year):
        """Return the ISO date for a ``'%b %d'`` string placed in ``year``.

        The year is parsed together with the month and day. Parsing the
        month and day alone would validate against strptime's default year
        1900, which rejects "Feb 29" even when ``year`` is a leap year.
        """
        stamped = '%s %d' % (text, year)
        return datetime.strptime(stamped, '%b %d %Y').date().isoformat()

    @staticmethod
    def _flatten_text(nodes):
        """Join the stripped text of ``nodes`` with single spaces."""
        pieces = []
        for node in nodes:
            # Exact type match kept on purpose: NavigableString subclasses
            # (e.g. Comment) keep going through ``.text`` as before.
            text = node if type(node) is NavigableString else node.text
            pieces.append(text.strip().replace('\r', ''))
        return ' '.join(pieces).strip()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be `from ..utils import Scraper`.