From 97691b41a010931f951bb046f5c435f6cb77d2ab Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Thu, 28 Apr 2016 02:51:26 -0400 Subject: [PATCH 1/3] Patch start/end year issue Get start/end date (w/ year) from index when fetching links --- uoftscrapers/scrapers/events/__init__.py | 42 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index d1bd180..6bb3257 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -19,17 +19,17 @@ def scrape(location='.'): Scraper.logger.info('Events initialized.') Scraper.ensure_location(location) - for event_link in Events.get_events_links(): - doc = Events.get_event_doc(event_link) + for event in Events.get_events_list(): + doc = Events.get_event_doc(event['link'], event['date']) Scraper.save_json(doc, location, doc['id']) Scraper.logger.info('Events completed.') @staticmethod - def get_events_links(): + def get_events_list(): page_index_url = Events.host + 'index.php' url_parts = list(urlparse.urlparse(page_index_url)) - events_links = [] + events_links, events_dates = [], [] paging_index = 1 events_count = 10 @@ -44,8 +44,15 @@ def get_events_links(): events_dom_arr = soup.select('#results')[0].find_all('li') events_count = len(events_dom_arr) events_links += list(map(lambda e: e.a['href'], events_dom_arr)) + events_dates += list(map(lambda e: e.find('p').text.split(' : ')[1].split(', ')[0], events_dom_arr)) - return events_links + events_info = [] + for i in range(len(events_links)): + events_info.append({ + 'link': events_links[i], + 'date': events_dates[i] + }) + return events_info @staticmethod def convert_time(time_str): @@ -79,11 +86,12 @@ def normalize_text_sections(div): paragraph = paragraph.strip() paragraph = paragraph.replace('\r', '') paragraph = paragraph.replace('\n', ', ') + paragraph = paragraph.replace(' ', ' ') paragraph = paragraph.strip() return paragraph @staticmethod - def get_event_doc(url_tail): + def get_event_doc(url_tail, event_date): event_url = Events.host + url_tail html = Scraper.get(event_url) url_parts = list(urlparse.urlparse(event_url)) @@ -92,19 +100,27 @@ def get_event_doc(url_tail): event_id = query['eventid'] event_title = soup.select('.eventTitle')[0].text.strip() + + date_arr = event_date.split(' - ') + + start_date = date_arr[0].strip() + end_date = start_date if len(date_arr) == 1 else date_arr[1].strip() + + if start_date.count(' ') == 1: + # year not in start date + start_date = '%s %s' % (start_date, end_date.split(' ')[2]) + + event_start_date = datetime.strptime(start_date, + '%b %d %Y').date().isoformat() + event_end_date = datetime.strptime(end_date, + '%b %d %Y').date().isoformat() + raw_time = soup.select('.date')[0].text.split(',') - date_arr = raw_time[0].split(' - ') time_arr = re.split(' - | ', raw_time[1].strip()) # Some of the strings are misformed and gives an extra empty space time_arr = list(filter(None, time_arr)) - event_start_date = datetime.strptime(date_arr[0], '%b %d') - event_start_date = event_start_date.replace( - year=date.today().year).date().isoformat() - event_end_date = datetime.strptime(date_arr[-1], '%b %d') - event_end_date = event_end_date.replace( - year=date.today().year).date().isoformat() event_start_str = time_arr[0] event_end_str = time_arr[-2] + time_arr[-1] From 38f821bc364d7aaa0728c2d86d419d1ffafa8ae5 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Thu, 28 Apr 2016 03:00:57 -0400 Subject: [PATCH 2/3] Clean up --- uoftscrapers/scrapers/events/__init__.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/uoftscrapers/scrapers/events/__init__.py b/uoftscrapers/scrapers/events/__init__.py index 6bb3257..61cc510 100644 --- a/uoftscrapers/scrapers/events/__init__.py +++ b/uoftscrapers/scrapers/events/__init__.py @@ -20,7 +20,7 @@ def scrape(location='.'): Scraper.ensure_location(location) for event in Events.get_events_list(): - doc = Events.get_event_doc(event['link'], event['date']) + doc = Events.get_event_doc(event[0], event[1]) Scraper.save_json(doc, location, doc['id']) Scraper.logger.info('Events completed.') @@ -46,13 +46,7 @@ def get_events_list(): events_links += list(map(lambda e: e.a['href'], events_dom_arr)) events_dates += list(map(lambda e: e.find('p').text.split(' : ')[1].split(', ')[0], events_dom_arr)) - events_info = [] - for i in range(len(events_links)): - events_info.append({ - 'link': events_links[i], - 'date': events_dates[i] - }) - return events_info + return zip(events_links, events_dates) @staticmethod def convert_time(time_str): @@ -108,12 +102,13 @@ def get_event_doc(url_tail, event_date): if start_date.count(' ') == 1: # year not in start date - start_date = '%s %s' % (start_date, end_date.split(' ')[2]) + start_date = '%s %s' % (start_date, end_date[-4:]) - event_start_date = datetime.strptime(start_date, - '%b %d %Y').date().isoformat() - event_end_date = datetime.strptime(end_date, - '%b %d %Y').date().isoformat() + start_date = datetime.strptime(start_date, '%b %d %Y') + end_date = datetime.strptime(end_date, '%b %d %Y') + + event_start_date = start_date.date().isoformat() + event_end_date = end_date.date().isoformat() raw_time = soup.select('.date')[0].text.split(',') From 11aea3a15943680b1106edd1980d3c3c7643a118 Mon Sep 17 00:00:00 2001 From: Kashav Madan Date: Thu, 28 Apr 2016 03:20:10 -0400 Subject: [PATCH 3/3] Update Libraries schema --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 5593766..e3b89f6 100644 --- a/README.md +++ b/README.md @@ -648,42 +648,42 @@ https://onesearch.library.utoronto.ca/ about: String, collection_strengths: String, access: String, - hours: [{ - sunday: [{ + hours: { + sunday: { closed: Boolean, open: String, close: String, - }], - monday: [{ + }, + monday: { closed: Boolean, open: Number, close: Number, - }], - tuesday: [{ + }, + tuesday: { closed: Boolean, open: Number, close: Number, - }], - wednesday: [{ + }, + wednesday: { closed: Boolean, open: Number, close: Number, - }], - thursday: [{ + }, + thursday: { closed: Boolean, open: Number, close: Number, - }], - friday: [{ + }, + friday: { closed: Boolean, open: Number, close: Number, - }], - saturday: [{ + }, + saturday: { closed: Boolean, open: Number, close: Number, - }] - }] + } + } } ```