Skip to content

Commit

Permalink
Merge pull request #82 from kshvmdn/add-utsg-dates
Browse files Browse the repository at this point in the history
Dates Scraper
  • Loading branch information
qasim committed May 16, 2016
2 parents 2064067 + 2a432e2 commit 46a0904
Show file tree
Hide file tree
Showing 9 changed files with 506 additions and 70 deletions.
62 changes: 62 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ This is a library of scrapers for various University of Toronto websites. It is
- [Shuttle Bus Schedule](#shuttles)
- [Events](#events)
- [Libraries](#libraries)
- [Dates](#dates)
- [UTSG Dates](#utsg-dates)
- [UTM Dates](#utm-dates)

## Requirements
- [python3](https://www.python.org/download/releases/3.5.1)
Expand Down Expand Up @@ -692,3 +695,62 @@ https://onesearch.library.utoronto.ca/
}
}
```

--------------------------------------------------------------------------------

### Dates

##### Class name
```python
uoftscrapers.Dates
```

##### Scraper source
- [UTSG Dates](#utsg-dates)
- [UTM Dates](#utm-dates)

##### Output format
```js
{
"date": String,
"events": [{
"end_date": String,
"session": String,
"campus": String,
"description": String
}]
}
```

----------------------------------------

### UTSG Dates

##### Class name
```python
uoftscrapers.UTSGDates
```

##### Scraper source
- http://www.artsci.utoronto.ca/current/course/timetable/
- http://www.undergrad.engineering.utoronto.ca/About/Dates_Deadlines.htm

##### Output format
Refer to [Dates](#dates)

--------------------

### UTM Dates

##### Class name
```python
uoftscrapers.UTMDates
```

##### Scraper source
http://m.utm.utoronto.ca/importantDates.php

##### Output format
Refer to [Dates](#dates)

--------------------------------------------------------------------------------
2 changes: 2 additions & 0 deletions uoftscrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@

from .scrapers.libraries import Libraries

from .scrapers.dates import Dates

class NullHandler(logging.Handler):

def emit(self, record):
Expand Down
2 changes: 1 addition & 1 deletion uoftscrapers/scrapers/athletics/utm.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,4 @@ def scrape(location='.', month=None, save=True):
Scraper.save_json(doc, location, id_)

Scraper.logger.info('UTMAthletics completed.')
return athletics
return athletics if not save else None
3 changes: 2 additions & 1 deletion uoftscrapers/scrapers/athletics/utsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,5 @@ def scrape(location='.', month=None, save=True):
Scraper.save_json(doc, location, id_)

Scraper.logger.info('UTSCAthletics completed.')
return athletics

return athletics if not save else None
70 changes: 4 additions & 66 deletions uoftscrapers/scrapers/calendar/utm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,76 +4,14 @@
import json
import os
import requests
import datetime


class UTMCalendar:
'''Scraper for Important dates from UTM calendar found at https://www.utm.utoronto.ca/registrar/important-dates
'''

link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header='
sessionNumber = [5, 9]
@staticmethod
def scrape(location='.', year=None): #scrapes most current sessions by default

year = year or datetime.datetime.now().year
host = 'http://www.artsandscience.utoronto.ca/ofr/calendar/'

currentSession = "{0} SUMMER"
calendar = OrderedDict()
@staticmethod
def scrape(location='.'):
Scraper.logger.info('UTMCalendar initialized.')
for session in UTMCalendar.sessionNumber:
html = Scraper.get(UTMCalendar.link.format(year, session))
soup = BeautifulSoup(html, 'html.parser')
content = soup.find('div', class_='content')
dates = content.find_all('div', class_='title')
i = 0
currentDate = dates[i]
while(i<len(dates)):
date = dates[i].text
events = []
while (currentDate == dates[i]):
info = dates[i].find_next('div', class_='info')
description = info.text
eventStartEnd = date.split('-') #splits event dates over a period
eventStart = UTMCalendar.convert_date(eventStartEnd[0].strip())
if len(eventStartEnd)>1:
eventEnd = UTMCalendar.convert_date(eventStartEnd[1].strip())
else:
eventEnd = eventStart

events.append(OrderedDict([
('end_date', eventEnd),
('session', currentSession.format(UTMCalendar.get_year_from(eventStart))),
('campus', 'UTM'),
('description', description)
]))
i+=1
if(i>=len(dates)):
break;
calendar[date] = OrderedDict([
('date', eventStart),
('events', events)
])
if(i<len(dates)):
currentDate = dates[i]
currentSession = "{0} FALL/WINTER"


for date, info in calendar.items():
Scraper.save_json(info, location, UTMCalendar.convert_date(date))

Scraper.logger.info('Not implemented.')
Scraper.logger.info('UTMCalendar completed.')
return calendar

@staticmethod
def convert_date(date):
splitDate = date.split(' ')
year = splitDate[2]
day = splitDate[1].strip(',')
month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m')
return("{0}-{1}-{2}".format(year, month, day.zfill(2)))

@staticmethod
def get_year_from(date):
splitDate = date.split('-')
return splitDate[0]
33 changes: 33 additions & 0 deletions uoftscrapers/scrapers/dates/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from ..utils import Scraper
from .utsg import UTSGDates
from .utm import UTMDates

from collections import OrderedDict


class Dates:
    '''Combined scraper that merges the dates returned by the campus
    scrapers (UTSGDates and UTMDates) into one document per date.
    '''

    @staticmethod
    def scrape(location='.', year=None, save=True):
        '''Scrape every campus and merge their events by date.

        Parameters:
            location: directory handed to Scraper.save_json for the output.
            year: forwarded unchanged to each campus scraper.
            save: when true (the default) every merged document is written
                with Scraper.save_json and None is returned; when false
                nothing is written and the merged mapping is returned.

        Returns:
            None when ``save`` is true, otherwise an OrderedDict mapping each
            date key to ``{'date': <key>, 'events': [...]}``.
        '''
        Scraper.logger.info('Dates initialized.')

        docs = OrderedDict()

        for campus in UTSGDates, UTMDates:
            # save=False makes the campus scraper hand its data back instead
            # of writing per-campus files.
            dates = campus.scrape(location, year=year, save=False)

            # A campus scraper that produced nothing is simply skipped.
            if dates is None:
                continue

            for date, doc in dates.items():
                if date not in docs:
                    docs[date] = OrderedDict([
                        ('date', date),
                        ('events', [])
                    ])
                docs[date]['events'].extend(doc['events'])

        if save:
            for date, doc in docs.items():
                Scraper.save_json(doc, location, date)

        Scraper.logger.info('Dates completed.')
        # Same convention as the other scrapers touched by this change:
        # data is only returned when it was not written to disk.
        return docs if not save else None
79 changes: 79 additions & 0 deletions uoftscrapers/scrapers/dates/utm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from ..utils import Scraper
from bs4 import BeautifulSoup
from collections import OrderedDict
import json
import os
import requests
import datetime


class UTMDates:
    '''Scraper for important dates from the UTM calendar found at
    https://www.utm.utoronto.ca/registrar/important-dates
    '''

    # URL template: {0} is filled with the year, {1} with a session number.
    link = 'http://m.utm.utoronto.ca/importantDates.php?mode=full&session={0}{1}&header='
    # Session numbers requested for the year, in order. Events of the first
    # entry are labelled SUMMER, events of every later entry FALL/WINTER.
    sessionNumber = [5, 9]

    @staticmethod
    def scrape(location='.', year=None, save=True):
        '''Scrape the important dates of each session in ``sessionNumber``.

        Parameters:
            location: directory handed to Scraper.save_json for the output.
            year: year used to build the session URLs; defaults to the
                current year.
            save: when true (the default) one JSON document per date is
                written and None is returned; when false nothing is written
                and the collected mapping is returned.

        Returns:
            None when ``save`` is true, otherwise an OrderedDict mapping each
            start date ('YYYY-MM-DD') to ``{'date': ..., 'events': [...]}``.
        '''
        # Scrapes the most current sessions by default.
        year = year or datetime.datetime.now().year

        calendar = OrderedDict()
        Scraper.logger.info('UTMDates initialized.')

        for index, session in enumerate(UTMDates.sessionNumber):
            session_format = '{0} SUMMER' if index == 0 else '{0} FALL/WINTER'

            html = Scraper.get(UTMDates.link.format(year, session))
            soup = BeautifulSoup(html, 'html.parser')
            content = soup.find('div', class_='content')
            if content is None:
                # The page has no content container, so there is nothing to
                # parse for this session; carry on with the next one.
                Scraper.logger.info('No dates found for session %s%s.' % (year, session))
                continue

            for title in content.find_all('div', class_='title'):
                # The description of an entry is the first 'info' div that
                # follows its 'title' div.
                info = title.find_next('div', class_='info')
                description = info.text if info is not None else ''

                start, event = UTMDates._build_event(
                    title.text, description, session_format)

                # Merge instead of overwrite: several entries (a single day
                # and a range, or entries from both sessions) may share the
                # same start date.
                if start not in calendar:
                    calendar[start] = OrderedDict([
                        ('date', start),
                        ('events', [])
                    ])
                calendar[start]['events'].append(event)

        if save:
            for date, info in calendar.items():
                # Keys are already in 'YYYY-MM-DD' form, so they are used as
                # the file id directly (converting them again would fail).
                Scraper.save_json(info, location, date)

        Scraper.logger.info('UTMDates completed.')
        return calendar if not save else None

    @staticmethod
    def _build_event(date_text, description, session_format):
        '''Build one event from a title such as 'May 9, 2016' or
        'May 9, 2016 - May 13, 2016'.

        Returns a ``(start_date, event)`` tuple where ``start_date`` is the
        converted first date and ``event`` is the OrderedDict stored in the
        output. A title without a range ends on its start date.
        '''
        # A '-' separates the start and end of an event spanning a period.
        start_end = date_text.split('-')
        start = UTMDates.convert_date(start_end[0].strip())
        if len(start_end) > 1:
            end = UTMDates.convert_date(start_end[1].strip())
        else:
            end = start

        event = OrderedDict([
            ('end_date', end),
            ('session', session_format.format(UTMDates.get_year_from(start))),
            ('campus', 'UTM'),
            ('description', description)
        ])
        return start, event

    @staticmethod
    def convert_date(date):
        '''Convert a date written as 'Month D, YYYY' into 'YYYY-MM-DD'.

        Raises:
            IndexError: if the text has fewer than three whitespace-separated
                parts.
            ValueError: if the first part is not a full month name.
        '''
        # split() without arguments tolerates repeated or non-breaking
        # whitespace between the parts.
        splitDate = date.split()
        year = splitDate[2]
        day = splitDate[1].strip(',')
        month = datetime.datetime.strptime(splitDate[0], '%B').strftime('%m')
        return "{0}-{1}-{2}".format(year, month, day.zfill(2))

    @staticmethod
    def get_year_from(date):
        '''Return the year part of a 'YYYY-MM-DD' date string.'''
        return date.split('-')[0]
Loading

0 comments on commit 46a0904

Please sign in to comment.