
Commit

Merge branch 'master' of github.com:cobalt-uoft/uoft-scrapers
qasim committed Apr 29, 2016
2 parents 3d83733 + d05c150 commit 092e756
Showing 3 changed files with 61 additions and 36 deletions.
32 changes: 16 additions & 16 deletions README.md
````diff
@@ -648,42 +648,42 @@ https://onesearch.library.utoronto.ca/
     about: String,
     collection_strengths: String,
     access: String,
-    hours: [{
-        sunday: [{
+    hours: {
+        sunday: {
             closed: Boolean,
             open: String,
             close: String,
-        }],
-        monday: [{
+        },
+        monday: {
             closed: Boolean,
             open: Number,
             close: Number,
-        }],
-        tuesday: [{
+        },
+        tuesday: {
             closed: Boolean,
             open: Number,
             close: Number,
-        }],
-        wednesday: [{
+        },
+        wednesday: {
             closed: Boolean,
             open: Number,
             close: Number,
-        }],
-        thursday: [{
+        },
+        thursday: {
             closed: Boolean,
             open: Number,
             close: Number,
-        }],
-        friday: [{
+        },
+        friday: {
             closed: Boolean,
             open: Number,
             close: Number,
-        }],
-        saturday: [{
+        },
+        saturday: {
             closed: Boolean,
             open: Number,
             close: Number,
-        }]
-    }]
+        }
+    }
 }
 ```
````
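For illustration, this change replaces the one-element arrays around `hours` and each weekday with plain objects. A record matching the new shape might look like the following sketch (all values invented; note the schema above types Sunday's `open`/`close` as String while the other days use Number):

```python
# Hypothetical library record under the new hours schema (values made up).
library = {
    'hours': {
        'sunday':  {'closed': True,  'open': '', 'close': ''},
        'monday':  {'closed': False, 'open': 9,  'close': 22},
        'tuesday': {'closed': False, 'open': 9,  'close': 22},
        # ...wednesday through saturday follow the same shape
    }
}

# Consumers now index by weekday name instead of unwrapping an array:
if not library['hours']['monday']['closed']:
    print(library['hours']['monday']['open'])
```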
37 changes: 24 additions & 13 deletions uoftscrapers/scrapers/events/__init__.py
```diff
@@ -19,17 +19,17 @@ def scrape(location='.'):
         Scraper.logger.info('Events initialized.')
         Scraper.ensure_location(location)
 
-        for event_link in Events.get_events_links():
-            doc = Events.get_event_doc(event_link)
+        for event in Events.get_events_list():
+            doc = Events.get_event_doc(event[0], event[1])
             Scraper.save_json(doc, location, doc['id'])
 
         Scraper.logger.info('Events completed.')
 
     @staticmethod
-    def get_events_links():
+    def get_events_list():
         page_index_url = Events.host + 'index.php'
         url_parts = list(urlparse.urlparse(page_index_url))
-        events_links = []
+        events_links, events_dates = [], []
         paging_index = 1
         events_count = 10
 
@@ -44,8 +44,9 @@ def get_events_links():
             events_dom_arr = soup.select('#results')[0].find_all('li')
             events_count = len(events_dom_arr)
             events_links += list(map(lambda e: e.a['href'], events_dom_arr))
+            events_dates += list(map(lambda e: e.find('p').text.split(' : ')[1].split(', ')[0], events_dom_arr))
 
-        return events_links
+        return zip(events_links, events_dates)
 
     @staticmethod
     def convert_time(time_str):
@@ -79,11 +80,12 @@ def normalize_text_sections(div):
         paragraph = paragraph.strip()
         paragraph = paragraph.replace('\r', '')
         paragraph = paragraph.replace('\n', ', ')
+        paragraph = paragraph.replace('  ', ' ')
         paragraph = paragraph.strip()
         return paragraph
 
     @staticmethod
-    def get_event_doc(url_tail):
+    def get_event_doc(url_tail, event_date):
         event_url = Events.host + url_tail
         html = Scraper.get(event_url)
         url_parts = list(urlparse.urlparse(event_url))
@@ -92,19 +94,28 @@ def get_event_doc(url_tail):
 
         event_id = query['eventid']
         event_title = soup.select('.eventTitle')[0].text.strip()
+
+        date_arr = event_date.split(' - ')
+
+        start_date = date_arr[0].strip()
+        end_date = start_date if len(date_arr) == 1 else date_arr[1].strip()
+
+        if start_date.count(' ') == 1:
+            # year not in start date
+            start_date = '%s %s' % (start_date, end_date[-4:])
+
+        start_date = datetime.strptime(start_date, '%b %d %Y')
+        end_date = datetime.strptime(end_date, '%b %d %Y')
+
+        event_start_date = start_date.date().isoformat()
+        event_end_date = end_date.date().isoformat()
+
         raw_time = soup.select('.date')[0].text.split(',')
 
-        date_arr = raw_time[0].split(' - ')
         time_arr = re.split(' - | ', raw_time[1].strip())
 
         # Some of the strings are misformed and gives an extra empty space
         time_arr = list(filter(None, time_arr))
-        event_start_date = datetime.strptime(date_arr[0], '%b %d')
-        event_start_date = event_start_date.replace(
-            year=date.today().year).date().isoformat()
-        event_end_date = datetime.strptime(date_arr[-1], '%b %d')
-        event_end_date = event_end_date.replace(
-            year=date.today().year).date().isoformat()
 
         event_start_str = time_arr[0]
         event_end_str = time_arr[-2] + time_arr[-1]
```
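The rewritten `get_event_doc` derives start and end dates from the listing page's date string (passed in as `event_date`) rather than the detail page's year-less `.date` field. A standalone sketch of that parsing logic, with assumed sample inputs in the `'%b %d %Y'` style the new code expects:

```python
from datetime import datetime

def parse_event_dates(event_date):
    # Mirrors the logic added in get_event_doc above.
    date_arr = event_date.split(' - ')

    start_date = date_arr[0].strip()
    end_date = start_date if len(date_arr) == 1 else date_arr[1].strip()

    if start_date.count(' ') == 1:
        # Year missing from the start date ('Apr 29' vs 'Apr 29 2016'):
        # borrow the trailing 4-digit year from the end date.
        start_date = '%s %s' % (start_date, end_date[-4:])

    start = datetime.strptime(start_date, '%b %d %Y')
    end = datetime.strptime(end_date, '%b %d %Y')
    return start.date().isoformat(), end.date().isoformat()

# Hypothetical listing strings (the exact site format is an assumption):
print(parse_event_dates('Apr 29 2016'))          # ('2016-04-29', '2016-04-29')
print(parse_event_dates('Apr 29 - May 2 2016'))  # ('2016-04-29', '2016-05-02')
```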
28 changes: 21 additions & 7 deletions uoftscrapers/scrapers/textbooks/__init__.py
```diff
@@ -104,6 +104,9 @@ def scrape(location='.'):
     def retrieve_terms():
         html = Scraper.get('%s/buy_courselisting.asp' % Textbooks.host)
 
+        if html is None:
+            return []
+
         listing = BeautifulSoup(html, "html.parser")
         terms = listing.find(id='fTerm').find_all('option')[1:]
 
@@ -137,7 +140,10 @@ def retrieve_departments(terms):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-                          params=payload, headers=headers)
+                          params=payload, headers=headers, max_attempts=3)
 
+        if xml is None:
+            continue
+
         departments = BeautifulSoup(xml, "xml").find_all('department')
         for department in departments:
@@ -168,7 +174,10 @@ def retrieve_courses(department):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-                          params=payload, headers=headers)
+                          params=payload, headers=headers, max_attempts=3)
 
+        if xml is None:
+            return []
+
         courses = BeautifulSoup(xml, "xml").find_all('course')
         for course in courses:
@@ -196,7 +205,10 @@ def retrieve_sections(course):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-                          params=payload, headers=headers)
+                          params=payload, headers=headers, max_attempts=3)
 
+        if xml is None:
+            return []
+
         sections = BeautifulSoup(xml, "xml").find_all('section')
         for section in sections:
@@ -223,14 +235,16 @@ def retrieve_books(section):
             'Referer': '%s/buy_courselisting.asp' % Textbooks.host
         }
 
-        xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-                          params=payload, headers=headers)
+        html = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
+                           params=payload, headers=headers, max_attempts=3)
 
+        if html is None:
+            return []
+
-        soup = BeautifulSoup(xml, "html.parser")
+        soup = BeautifulSoup(html, "html.parser")
         books = soup.find_all('tr', {'class': 'book'})
 
         if books == None:
-            done += 1
             return []
 
         for book in books:
```
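Each textbooks helper now calls `Scraper.get` with `max_attempts=3` and bails out with an empty result when it returns `None`. A minimal sketch of that contract, assuming a requests-based fetcher that gives up after its retry budget (the function name and URL here are illustrative, not the repo's actual `Scraper.get`):

```python
import requests

def get_with_retries(url, params=None, headers=None, max_attempts=3):
    # Returns the response body, or None once all attempts fail --
    # the contract the guards in retrieve_* above rely on.
    for _ in range(max_attempts):
        try:
            r = requests.get(url, params=params, headers=headers, timeout=10)
            if r.status_code == 200:
                return r.text
        except requests.RequestException:
            pass  # transient failure; try again
    return None

# Callers degrade gracefully instead of crashing on a dead endpoint:
xml = get_with_retries('https://example.com/textbooks_xml.asp')
if xml is None:
    print('Giving up on this batch.')  # mirrors the `return []` / `continue` above
```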
