From 84cadfce59a05e47edcc7dba649e5c57b9f9143d Mon Sep 17 00:00:00 2001
From: Kashav Madan
Date: Thu, 28 Apr 2016 20:34:20 -0400
Subject: [PATCH] Patch hang on book retrieve

---
 uoftscrapers/scrapers/textbooks/__init__.py | 28 +++++++++++++++------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/uoftscrapers/scrapers/textbooks/__init__.py b/uoftscrapers/scrapers/textbooks/__init__.py
index 71ad746..d8aaaad 100644
--- a/uoftscrapers/scrapers/textbooks/__init__.py
+++ b/uoftscrapers/scrapers/textbooks/__init__.py
@@ -104,6 +104,9 @@ def scrape(location='.'):
     def retrieve_terms():
         html = Scraper.get('%s/buy_courselisting.asp' % Textbooks.host)
 
+        if html is None:
+            return []
+
         listing = BeautifulSoup(html, "html.parser")
         terms = listing.find(id='fTerm').find_all('option')[1:]
 
@@ -137,7 +140,10 @@ def retrieve_departments(terms):
             }
 
             xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-                params=payload, headers=headers)
+                params=payload, headers=headers, max_attempts=3)
+
+            if xml is None:
+                continue
 
             departments = BeautifulSoup(xml, "xml").find_all('department')
             for department in departments:
@@ -168,7 +174,10 @@ def retrieve_courses(department):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-            params=payload, headers=headers)
+            params=payload, headers=headers, max_attempts=3)
+
+        if xml is None:
+            return []
 
         courses = BeautifulSoup(xml, "xml").find_all('course')
         for course in courses:
@@ -196,7 +205,10 @@ def retrieve_sections(course):
         }
 
         xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-            params=payload, headers=headers)
+            params=payload, headers=headers, max_attempts=3)
+
+        if xml is None:
+            return []
 
         sections = BeautifulSoup(xml, "xml").find_all('section')
         for section in sections:
@@ -223,14 +235,16 @@ def retrieve_books(section):
             'Referer': '%s/buy_courselisting.asp' % Textbooks.host
         }
 
-        xml = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
-            params=payload, headers=headers)
+        html = Scraper.get('%s/textbooks_xml.asp' % Textbooks.host,
+            params=payload, headers=headers, max_attempts=3)
+
+        if html is None:
+            return []
 
-        soup = BeautifulSoup(xml, "html.parser")
+        soup = BeautifulSoup(html, "html.parser")
         books = soup.find_all('tr', {'class': 'book'})
 
         if books == None:
-            done += 1
             return []
 
         for book in books: