Merge pull request #56 from g3wanghc/master
UTEvents #36
qasim committed Apr 19, 2016
2 parents 631d3db + b3fc5d0 commit 385b6fc
Showing 3 changed files with 147 additions and 0 deletions.
30 changes: 30 additions & 0 deletions README.md
@@ -588,3 +588,33 @@ https://m.utm.utoronto.ca/shuttle.php
}]
}
```

------

### Events

##### Class name
```python
uoftscrapers.Events
```

##### Scraper source
https://www.events.utoronto.ca/

##### Output format
```js
{
  id: String,
  title: String,
  start_date: String,
  end_date: String,
  start_time: String,
  end_time: String,
  url: String,
  description: String,
  admission_price: String,
  campus: String,
  location: String,
  audiences: [String],
}
```
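
A minimal usage sketch (not part of this diff), assuming the package-level `Events` import added in `uoftscrapers/__init__.py` below; `scrape(location)` writes one JSON document per event, named by its `id`, into the given directory:

```python
import uoftscrapers

# Scrape every listed event and save each as a JSON file under ./output.
uoftscrapers.Events.scrape('./output')
```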
2 changes: 2 additions & 0 deletions uoftscrapers/__init__.py
@@ -34,6 +34,8 @@

from .scrapers.shuttle import Shuttle

from .scrapers.events import Events


class NullHandler(logging.Handler):
    def emit(self, record):
115 changes: 115 additions & 0 deletions uoftscrapers/scrapers/events/__init__.py
@@ -0,0 +1,115 @@
from ..utils import Scraper
from bs4 import BeautifulSoup, NavigableString
from datetime import datetime, date
from collections import OrderedDict
import urllib.parse as urlparse
from urllib.parse import urlencode
import re

class Events:
    """A scraper for Events at the University of Toronto."""

    host = 'https://www.events.utoronto.ca/'

    @staticmethod
    def scrape(location='.'):
        Scraper.logger.info('Events initialized.')
        Scraper.ensure_location(location)

        for event_link in Events.get_events_links():
            doc = Events.get_event_doc(event_link)
            Scraper.save_json(doc, location, doc['id'])

        Scraper.logger.info('Events completed.')

    @staticmethod
    def get_events_links():
        page_index_url = Events.host + 'index.php'
        url_parts = list(urlparse.urlparse(page_index_url))
        events_links = []
        paging_index = 1
        events_count = 10
        # A full results page lists 10 events; a shorter page is the last one.
        while events_count == 10:
            params = {
                'p': paging_index
            }
            url_parts[4] = urlencode(params)
            paging_index += 1
            html = Scraper.get(urlparse.urlunparse(url_parts))
            soup = BeautifulSoup(html, 'html.parser')
            events_dom_arr = soup.select('#results')[0].find_all('li')
            events_count = len(events_dom_arr)
            events_links += [e.a['href'] for e in events_dom_arr]
        return events_links

    @staticmethod
    def get_event_doc(url_tail):
        event_url = Events.host + url_tail
        html = Scraper.get(event_url)
        url_parts = list(urlparse.urlparse(event_url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        soup = BeautifulSoup(html, 'html.parser')

        event_id = query['eventid']
        event_title = soup.select('.eventTitle')[0].text.strip()
        raw_time = soup.select('.date')[0].text.split(',')

        date_arr = raw_time[0].split(' - ')
        time_arr = re.split(' - | ', raw_time[1].strip())
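        # Assumed '.date' text shape (inferred from the parsing above): e.g.
        # 'Apr 19, 7:00 - 9:00 p.m.' yields date_arr == ['Apr 19'] and
        # time_arr == ['7:00', '9:00', 'p.m.']; a multi-day event reads
        # 'Apr 19 - Apr 21, ...', so date_arr has a start and an end entry.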

        # Some of the raw strings are malformed and yield empty entries.
        time_arr = list(filter(None, time_arr))
        event_start_date = datetime.strptime(date_arr[0], '%b %d').replace(
            year=date.today().year).date().isoformat()
        event_end_date = datetime.strptime(date_arr[-1], '%b %d').replace(
            year=date.today().year).date().isoformat()

        # Note: some events span several days (e.g. event 8350), so the
        # times below are not tied to a specific date.
        event_meridiem = time_arr[2]
        event_start_time = time_arr[0] + ' ' + event_meridiem
        event_end_time = time_arr[1] + ' ' + event_meridiem

        evt_bar = soup.select('#evt_bar')[0]
        event_url = evt_bar.select('dd')[1].a['href']
        event_price = evt_bar.select('dl')[1].dd.text

        event_campus = ''
        if evt_bar.select('dd')[0].b is not None:
            event_campus = evt_bar.select('dd')[0].b.text

        event_address = ''
        address_block = evt_bar.select('dd')[0]
        if address_block.a is not None:
            address_block = address_block.a
        for content in address_block.contents:
            text = content if isinstance(content, NavigableString) else content.text
            event_address += text.strip().replace('\r', '') + ' '
        event_address = event_address.strip()

        event_audiences = [a.text for a in
                           evt_bar.select('dl')[1].select('dd')[1].select('a')]

        # Remove the nodes already parsed above so that only the free-form
        # description remains inside #content.
        soup.select('.eventTitle')[0].extract()
        soup.select('.date')[0].extract()
        evt_bar.extract()
        soup.select('#cal_bar')[0].extract()
        event_description = ''
        for content in soup.select('#content')[0].contents:
            text = content if isinstance(content, NavigableString) else content.text
            event_description += text.strip().replace('\r', '') + ' '
        event_description = event_description.strip()

        doc = OrderedDict([
            ('id', event_id),
            ('title', event_title),
            ('start_date', event_start_date),
            ('end_date', event_end_date),
            ('start_time', event_start_time),
            ('end_time', event_end_time),
            ('url', event_url),
            ('description', event_description),
            ('admission_price', event_price),
            ('campus', event_campus),
            ('location', event_address),
            ('audiences', event_audiences),
        ])
        return doc
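
For reference, a minimal sketch of reading the saved documents back, assuming `Scraper.save_json(doc, location, doc['id'])` writes one `<id>.json` file per event into the target directory (the exact filename pattern is an assumption, not confirmed by this diff):

```python
import json
import os

output_dir = './output'  # directory previously passed to Events.scrape

# Load each saved event document and print a short summary.
for name in sorted(os.listdir(output_dir)):
    if name.endswith('.json'):
        with open(os.path.join(output_dir, name)) as f:
            event = json.load(f)
        print(event['id'], event['start_date'], event['title'])
```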
