Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dates Scraper #82

Merged
merged 11 commits into from
May 16, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ This is a library of scrapers for various University of Toronto websites. It is
- [Shuttle Bus Schedule](#shuttles)
- [Events](#events)
- [Libraries](#libraries)
- [Dates](#dates)
- [UTSG Dates](#utsg-dates)
- [UTM Dates](#utm-dates)

## Requirements
- [python3](https://www.python.org/download/releases/3.5.1)
Expand Down Expand Up @@ -692,3 +695,62 @@ https://onesearch.library.utoronto.ca/
}
}
```

--------------------------------------------------------------------------------

### Dates

##### Class name
```python
uoftscrapers.Dates
```

##### Scraper source
- [UTSG Dates](#utsg-dates)
- [UTM Dates](#utm-dates)

##### Output format
```js
{
"date": String,
"events": [{
"end_date": String,
"session": String,
"campus": String,
"description": String
}]
}
```

----------------------------------------

### UTSG Dates

##### Class name
```python
uoftscrapers.UTSGDates
```

##### Scraper source
http://www.artsci.utoronto.ca/current/course/timetable/
http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm

##### Output format
Refer to [Dates](#dates)

--------------------

### UTM Dates

##### Class name
```python
uoftscrapers.UTMDates
```

##### Scraper source
http://m.utm.utoronto.ca/importantDates.php

##### Output format
Refer to [Dates](#dates)

--------------------------------------------------------------------------------
2 changes: 2 additions & 0 deletions uoftscrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@

from .scrapers.libraries import Libraries

from .scrapers.dates import Dates

class NullHandler(logging.Handler):

def emit(self, record):
Expand Down
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/athletics/utm.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,4 @@ def scrape(location='.', month=None, save=True):
Scraper.save_json(doc, location, id_)

Scraper.logger.info('UTMAthletics completed.')
return athletics
return athletics if not save else None
3 changes: 2 additions & 1 deletion uoftscrapers/scrapers/athletics/utsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,5 @@ def scrape(location='.', month=None, save=True):
Scraper.save_json(doc, location, id_)

Scraper.logger.info('UTSCAthletics completed.')
return athletics

return athletics if not save else None
70 changes: 4 additions & 66 deletions uoftscrapers/scrapers/calendar/utm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,76 +4,14 @@
import json
import os
import requests
import datetime


class UTMCalendar:
'''Scraper for Important dates from UTM calendar found at https://www.utm.utoronto.ca/registrar/important-dates
'''

link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header='
sessionNumber = [5, 9]
@staticmethod
def scrape(location='.', year=None): #scrapes most current sessions by default

year = year or datetime.datetime.now().year
host = 'http://www.artsandscience.utoronto.ca/ofr/calendar/'

currentSession = "{0} SUMMER"
calendar = OrderedDict()
@staticmethod
def scrape(location='.'):
Scraper.logger.info('UTMCalendar initialized.')
for session in UTMCalendar.sessionNumber:
html = Scraper.get(UTMCalendar.link.format(year, session))
soup = BeautifulSoup(html, 'html.parser')
content = soup.find('div', class_='content')
dates = content.find_all('div', class_='title')
i = 0
currentDate = dates[i]
while(i<len(dates)):
date = dates[i].text
events = []
while (currentDate == dates[i]):
info = dates[i].find_next('div', class_='info')
description = info.text
eventStartEnd = date.split('-') #splits event dates over a period
eventStart = UTMCalendar.convert_date(eventStartEnd[0].strip())
if len(eventStartEnd)>1:
eventEnd = UTMCalendar.convert_date(eventStartEnd[1].strip())
else:
eventEnd = eventStart

events.append(OrderedDict([
('end_date', eventEnd),
('session', currentSession.format(UTMCalendar.get_year_from(eventStart))),
('campus', 'UTM'),
('description', description)
]))
i+=1
if(i>=len(dates)):
break;
calendar[date] = OrderedDict([
('date', eventStart),
('events', events)
])
if(i<len(dates)):
currentDate = dates[i]
currentSession = "{0} FALL/WINTER"


for date, info in calendar.items():
Scraper.save_json(info, location, UTMCalendar.convert_date(date))

Scraper.logger.info('Not implemented.')
Scraper.logger.info('UTMCalendar completed.')
return calendar

@staticmethod
def convert_date(date):
splitDate = date.split(' ')
year = splitDate[2]
day = splitDate[1].strip(',')
month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m')
return("{0}-{1}-{2}".format(year, month, day.zfill(2)))

@staticmethod
def get_year_from(date):
splitDate = date.split('-')
return splitDate[0]
33 changes: 33 additions & 0 deletions uoftscrapers/scrapers/dates/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from ..utils import Scraper
from .utsg import UTSGDates
from .utm import UTMDates

from collections import OrderedDict


class Dates:
    '''Aggregate scraper that merges the output of the per-campus date
    scrapers into one document per date.
    '''

    @staticmethod
    def scrape(location='.', year=None):
        '''Run every campus scraper, merge their events by date and save
        one JSON document per date under `location`.

        Each campus scraper is called with save=False so that it hands its
        documents back instead of writing them itself; a scraper that
        returns None is skipped.
        '''
        Scraper.logger.info('Dates initialized.')

        merged = OrderedDict()

        for scraper in (UTSGDates, UTMDates):
            result = scraper.scrape(location, year=year, save=False)
            if result is None:
                continue

            for day, entry in result.items():
                # Create the per-date document on first sight, then append
                # this campus' events to whatever is already there.
                bucket = merged.setdefault(day, OrderedDict([
                    ('date', day),
                    ('events', [])
                ]))
                bucket['events'].extend(entry['events'])

        for day, entry in merged.items():
            Scraper.save_json(entry, location, day)

        Scraper.logger.info('Dates completed.')
79 changes: 79 additions & 0 deletions uoftscrapers/scrapers/dates/utm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from collections import OrderedDict
import json
import os
import requests
import datetime


class UTMDates:
    '''Scraper for Important dates from UTM calendar found at
    https://www.utm.utoronto.ca/registrar/important-dates
    '''

    # URL template: {0} is the year, {1} is one of the session numbers below.
    link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header='
    # Session numbers appended to the year. scrape() labels the first one
    # "SUMMER" and every following one "FALL/WINTER".
    sessionNumber = [5, 9]

    @staticmethod
    def scrape(location='.', year=None, save=True):
        '''Scrape the important dates for `year` (current year by default).

        Builds an OrderedDict keyed by start date ('YYYY-MM-DD'); each value
        is a document with the keys 'date' and 'events'.

        When `save` is true every document is written with
        Scraper.save_json under `location` and None is returned; otherwise
        the OrderedDict is returned and nothing is written.
        '''
        year = year or datetime.datetime.now().year

        calendar = OrderedDict()
        Scraper.logger.info('UTMDates initialized.')

        for index, session in enumerate(UTMDates.sessionNumber):
            # The first session is the summer one, later ones fall/winter.
            if index == 0:
                session_label = "{0} SUMMER"
            else:
                session_label = "{0} FALL/WINTER"

            html = Scraper.get(UTMDates.link.format(year, session))
            soup = BeautifulSoup(html, 'html.parser')
            content = soup.find('div', class_='content')
            if content is None:
                # Nothing to parse for this session; skip it instead of
                # failing on the attribute access below.
                continue

            for title in content.find_all('div', class_='title'):
                start, end = UTMDates._split_range(title.text)

                info = title.find_next('div', class_='info')
                description = info.text if info is not None else ''

                event = OrderedDict([
                    ('end_date', end),
                    ('session', session_label.format(
                        UTMDates.get_year_from(start))),
                    ('campus', 'UTM'),
                    ('description', description)
                ])

                # Group by start date and append, so that entries sharing a
                # start date (a single-day entry and a range starting the
                # same day, or a repeat in another session) are merged
                # rather than overwriting each other.
                doc = calendar.setdefault(start, OrderedDict([
                    ('date', start),
                    ('events', [])
                ]))
                doc['events'].append(event)

        if save:
            # Keys are already in 'YYYY-MM-DD' form; they must not be passed
            # through convert_date again.
            for date, info in calendar.items():
                Scraper.save_json(info, location, date)

        Scraper.logger.info('UTMDates completed.')
        return calendar if not save else None

    @staticmethod
    def _split_range(date_text):
        '''Return (start, end) in 'YYYY-MM-DD' form for a date heading.

        A heading is either a single date ('May 5, 2016') or a period
        written as 'May 5, 2016 - May 10, 2016'. For a single date the end
        equals the start.
        '''
        parts = date_text.split('-')
        start = UTMDates.convert_date(parts[0].strip())
        if len(parts) > 1:
            end = UTMDates.convert_date(parts[1].strip())
        else:
            end = start
        return start, end

    @staticmethod
    def convert_date(date):
        '''Convert 'Month D, YYYY' (e.g. 'May 5, 2016') to 'YYYY-MM-DD'.'''
        # split() without an argument tolerates repeated whitespace.
        splitDate = date.split()
        year = splitDate[2]
        day = splitDate[1].strip(',')
        month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m')
        return "{0}-{1}-{2}".format(year, month, day.zfill(2))

    @staticmethod
    def get_year_from(date):
        '''Return the year part of a 'YYYY-MM-DD' string.'''
        return date.split('-')[0]
Loading