Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UTEvents #36 #56

Merged
merged 12 commits into from
Apr 19, 2016
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -588,3 +588,33 @@ https://m.utm.utoronto.ca/shuttle.php
}]
}
```

------

### UTEvents

##### Class name
```python
uoftscrapers.UTEvents
```

##### Scraper source
https://www.events.utoronto.ca/

##### Output format
```js
{
id: String,
title: String,
  start_date: String,
end_date: String,
start_time: String,
end_time: String,
url: String,
description: String,
admission_price: String,
campus: String,
address: String,
audiences: [String],
}
```
2 changes: 2 additions & 0 deletions uoftscrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

from .scrapers.shuttle import Shuttle

from .scrapers.events import UTEvents


class NullHandler(logging.Handler):
def emit(self, record):
Expand Down
126 changes: 126 additions & 0 deletions uoftscrapers/scrapers/events/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from ..scraper import Scraper
Copy link
Member

@kashav kashav Apr 19, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be from ..utils import Scraper

from bs4 import BeautifulSoup, NavigableString
from datetime import datetime, date
from collections import OrderedDict
import urllib.parse as urlparse
from urllib.parse import urlencode
import requests
import pytz
import json
import re

class UTEvents:
    """A scraper for events listed at https://www.events.utoronto.ca/.

    Pages through the event index, scrapes each event's detail page, and
    writes one JSON document per event into the output directory.
    """

    host = 'https://www.events.utoronto.ca/'
    s = requests.Session()

    # Events shown per index page; a shorter page marks the end of the list.
    PAGE_SIZE = 10

    @staticmethod
    def scrape(location='.'):
        """Scrape every listed event, saving each as `<id>.json` under `location`."""
        Scraper.logger.info('UTEvents initialized.')
        Scraper.ensure_location(location)

        for event_link in UTEvents.get_events_links():
            doc = UTEvents.get_event_doc(event_link)
            Scraper.logger.info('Scraped event: %s ' % (
                doc['id'],
            ))
            with open('%s/%s.json' % (location, doc['id']), 'w') as fp:
                json.dump(doc, fp)

        Scraper.logger.info('UTEvents completed.')

    @staticmethod
    def get_events_links():
        """Return the relative URLs of all events by paging through the index.

        Pagination stops as soon as a page yields fewer than PAGE_SIZE
        entries, which marks the final page.
        """
        page_index_url = UTEvents.host + 'index.php'
        url_parts = list(urlparse.urlparse(page_index_url))
        events_links = []
        paging_index = 1
        while True:
            url_parts[4] = urlencode({'p': paging_index})
            paging_index += 1
            html = UTEvents.s.get(urlparse.urlunparse(url_parts)).text
            soup = BeautifulSoup(html, 'html.parser')
            events_dom_arr = soup.select('#results')[0].find_all('li')
            events_links += [e.a['href'] for e in events_dom_arr]
            if len(events_dom_arr) < UTEvents.PAGE_SIZE:
                break
        return events_links

    @staticmethod
    def get_event_doc(url_tail):
        """Scrape one event detail page into an OrderedDict document.

        :param url_tail: relative URL of the event page (contains the
            ``eventid`` query parameter used as the document id).
        :returns: OrderedDict matching the schema documented in README.md.
        """
        event_url = UTEvents.host + url_tail
        html = UTEvents.s.get(event_url).text
        url_parts = list(urlparse.urlparse(event_url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        soup = BeautifulSoup(html, 'html.parser')

        event_id = query['eventid']
        event_title = soup.select('.eventTitle')[0].text.strip()
        raw_time = soup.select('.date')[0].text.split(',')

        date_arr = raw_time[0].split(' - ')
        # Some of the strings are malformed and give extra empty tokens;
        # drop them before interpreting the pieces.
        time_arr = list(filter(None, re.split(' - | ', raw_time[1].strip())))

        # The source omits the year; assume the current year.
        # Note: some events span several days (e.g. 8350), so start and end
        # dates can differ.
        year = date.today().year
        event_start_date = datetime.strptime(
            date_arr[0], '%b %d').replace(year=year).date().isoformat()
        event_end_date = datetime.strptime(
            date_arr[-1], '%b %d').replace(year=year).date().isoformat()

        # Times come in two shapes:
        #   "6 - 8 p.m."       -> ['6', '8', 'p.m.']          (shared meridiem)
        #   "10 a.m. - 2 p.m." -> ['10', 'a.m.', '2', 'p.m.'] (one each)
        # The original code assumed the 3-token form and produced garbage
        # like "10 2" for the 4-token form.
        if len(time_arr) >= 4:
            event_start_time = time_arr[0] + ' ' + time_arr[1]
            event_end_time = time_arr[2] + ' ' + time_arr[3]
        else:
            event_meridiem = time_arr[2]
            event_start_time = time_arr[0] + ' ' + event_meridiem
            event_end_time = time_arr[1] + ' ' + event_meridiem

        evt_bar = soup.select('#evt_bar')[0]
        event_url = evt_bar.select('dd')[1].a['href']
        event_price = evt_bar.select('dl')[1].dd.text

        # Off-campus events have no bolded campus name in the first <dd>.
        event_campus = 'Off Campus'
        if evt_bar.select('dd')[0].b is not None:
            event_campus = evt_bar.select('dd')[0].b.text

        event_address = ''
        address_block = evt_bar.select('dd')[0]
        if address_block.a is not None:
            address_block = address_block.a
        for content in address_block.contents:
            text = content if isinstance(content, NavigableString) else content.text
            event_address += text.strip().replace('\r', '') + ' '
        event_address = event_address.strip()

        event_audiences = [a.text for a in
                           evt_bar.select('dl')[1].select('dd')[1].select('a')]

        # Remove the pieces already captured so that #content holds only the
        # free-text description.
        soup.select('.eventTitle')[0].extract()
        soup.select('.date')[0].extract()
        evt_bar.extract()
        soup.select('#cal_bar')[0].extract()
        event_description = ''
        for content in soup.select('#content')[0].contents:
            text = content if isinstance(content, NavigableString) else content.text
            event_description += text.strip().replace('\r', '') + ' '
        event_description = event_description.strip()

        doc = OrderedDict([
            ('id', event_id),
            ('title', event_title),
            ('start_date', event_start_date),
            ('end_date', event_end_date),
            ('start_time', event_start_time),
            ('end_time', event_end_time),
            ('url', event_url),
            ('description', event_description),
            ('admission_price', event_price),
            ('campus', event_campus),
            ('address', event_address),
            ('audiences', event_audiences),
        ])
        return doc