Merge pull request #74 from kshvmdn/patch-events-dates
Patch start/end date year bug
qasim committed Apr 28, 2016
2 parents 7383412 + 11aea3a commit b343da9
Showing 2 changed files with 40 additions and 29 deletions.
README.md: 32 changes (16 additions, 16 deletions)
@@ -648,42 +648,42 @@ https://onesearch.library.utoronto.ca/
   about: String,
   collection_strengths: String,
   access: String,
-  hours: [{
-    sunday: [{
+  hours: {
+    sunday: {
       closed: Boolean,
      open: String,
      close: String,
-    }],
-    monday: [{
+    },
+    monday: {
      closed: Boolean,
      open: Number,
      close: Number,
-    }],
-    tuesday: [{
+    },
+    tuesday: {
      closed: Boolean,
      open: Number,
      close: Number,
-    }],
-    wednesday: [{
+    },
+    wednesday: {
      closed: Boolean,
      open: Number,
      close: Number,
-    }],
-    thursday: [{
+    },
+    thursday: {
      closed: Boolean,
      open: Number,
      close: Number,
-    }],
-    friday: [{
+    },
+    friday: {
      closed: Boolean,
      open: Number,
      close: Number,
-    }],
-    saturday: [{
+    },
+    saturday: {
      closed: Boolean,
      open: Number,
      close: Number,
-    }]
-  }]
+    }
+  }
 }
 ```
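
For reference, a minimal sketch of a document fragment matching the updated hours schema, where each day now maps to a single object rather than a one-element array. The values below are illustrative, not real scraper output:

```python
# Illustrative only: the shape follows the updated README schema above;
# the values themselves are made up.
hours = {
    'sunday':  {'closed': True,  'open': '',  'close': ''},    # open/close typed String
    'monday':  {'closed': False, 'open': 830, 'close': 2300},  # open/close typed Number
    'tuesday': {'closed': False, 'open': 830, 'close': 2300},
    # wednesday through saturday follow the same shape
}
```
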
uoftscrapers/scrapers/events/__init__.py: 37 changes (24 additions, 13 deletions)
@@ -19,17 +19,17 @@ def scrape(location='.'):
         Scraper.logger.info('Events initialized.')
         Scraper.ensure_location(location)

-        for event_link in Events.get_events_links():
-            doc = Events.get_event_doc(event_link)
+        for event in Events.get_events_list():
+            doc = Events.get_event_doc(event[0], event[1])
             Scraper.save_json(doc, location, doc['id'])

         Scraper.logger.info('Events completed.')

     @staticmethod
-    def get_events_links():
+    def get_events_list():
         page_index_url = Events.host + 'index.php'
         url_parts = list(urlparse.urlparse(page_index_url))
-        events_links = []
+        events_links, events_dates = [], []
         paging_index = 1
         events_count = 10

@@ -44,8 +44,9 @@ def get_events_links():
             events_dom_arr = soup.select('#results')[0].find_all('li')
             events_count = len(events_dom_arr)
             events_links += list(map(lambda e: e.a['href'], events_dom_arr))
+            events_dates += list(map(lambda e: e.find('p').text.split(' : ')[1].split(', ')[0], events_dom_arr))

-        return events_links
+        return zip(events_links, events_dates)

     @staticmethod
     def convert_time(time_str):
@@ -79,11 +80,12 @@ def normalize_text_sections(div):
         paragraph = paragraph.strip()
         paragraph = paragraph.replace('\r', '')
         paragraph = paragraph.replace('\n', ', ')
+        paragraph = paragraph.replace('  ', ' ')
         paragraph = paragraph.strip()
         return paragraph

     @staticmethod
-    def get_event_doc(url_tail):
+    def get_event_doc(url_tail, event_date):
         event_url = Events.host + url_tail
         html = Scraper.get(event_url)
         url_parts = list(urlparse.urlparse(event_url))
@@ -92,19 +94,28 @@ def get_event_doc(url_tail):

         event_id = query['eventid']
         event_title = soup.select('.eventTitle')[0].text.strip()
+
+        date_arr = event_date.split(' - ')
+
+        start_date = date_arr[0].strip()
+        end_date = start_date if len(date_arr) == 1 else date_arr[1].strip()
+
+        if start_date.count(' ') == 1:
+            # year not in start date
+            start_date = '%s %s' % (start_date, end_date[-4:])
+
+        start_date = datetime.strptime(start_date, '%b %d %Y')
+        end_date = datetime.strptime(end_date, '%b %d %Y')
+
+        event_start_date = start_date.date().isoformat()
+        event_end_date = end_date.date().isoformat()
+
         raw_time = soup.select('.date')[0].text.split(',')

-        date_arr = raw_time[0].split(' - ')
         time_arr = re.split(' - | ', raw_time[1].strip())

         # Some of the strings are misformed and gives an extra empty space
         time_arr = list(filter(None, time_arr))
-        event_start_date = datetime.strptime(date_arr[0], '%b %d')
-        event_start_date = event_start_date.replace(
-            year=date.today().year).date().isoformat()
-        event_end_date = datetime.strptime(date_arr[-1], '%b %d')
-        event_end_date = event_end_date.replace(
-            year=date.today().year).date().isoformat()

         event_start_str = time_arr[0]
         event_end_str = time_arr[-2] + time_arr[-1]
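
The substance of the patch is the new year handling in get_event_doc: the listing page supplies a date string that is either a single date or a ' - ' separated range, and when it is a range the start date may omit the year. A standalone sketch of that logic (the function name and sample inputs below are illustrative, not part of the scraper):

```python
from datetime import datetime

def parse_event_dates(event_date):
    """Re-statement of the patched start/end date logic, for illustration."""
    date_arr = event_date.split(' - ')

    start_date = date_arr[0].strip()
    end_date = start_date if len(date_arr) == 1 else date_arr[1].strip()

    if start_date.count(' ') == 1:
        # 'Apr 28' carries no year; borrow the 4-digit year from the end date
        start_date = '%s %s' % (start_date, end_date[-4:])

    start = datetime.strptime(start_date, '%b %d %Y')
    end = datetime.strptime(end_date, '%b %d %Y')
    return start.date().isoformat(), end.date().isoformat()

# Hypothetical inputs, showing what the old "assume the current year" code got wrong:
# parse_event_dates('Apr 28 - May 1 2016')      -> ('2016-04-28', '2016-05-01')
# parse_event_dates('Dec 31 2015 - Jan 2 2016') -> ('2015-12-31', '2016-01-02')
```
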
