
Commit b3839ff
fix: handle timeouts and downtimes
user2589 committed Jan 25, 2019
1 parent d5da256 commit b3839ff
Showing 1 changed file with 27 additions and 6 deletions.
stgithub.py (33 changes: 27 additions & 6 deletions)
@@ -67,8 +67,12 @@
 }
 
 
+class GitHubScrapingError(requests.HTTPError):
+    pass
+
+
 def normalize_text(string):
-    # type: (str) -> str
+    # type: (six.string_types) -> six.string_types
     """ Normalize spaces and newlines
     >>> normalize_text("\\nHello world \\t\\n!")
     'Hello world!'
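
A note on the new exception: because GitHubScrapingError subclasses requests.HTTPError, any caller that already catches requests' HTTP errors will also catch this one; only code that wants to treat exhausted retries specially needs to change. A minimal sketch of such a caller, assuming stgithub.py is importable and using an example repo slug:

import requests

from stgithub import GitHubScrapingError, Scraper

scraper = Scraper()
try:
    stats = scraper.project_contributor_stats('pandas-dev/pandas')
except GitHubScrapingError:
    # every retry either raised or got a 5xx: GitHub is likely down
    stats = None
except requests.HTTPError:
    # other HTTP-level failures; GitHubScrapingError would also land
    # here if the more specific clause above were removed
    raise
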
@@ -77,7 +81,7 @@ def normalize_text(string):


 def extract_repo(link):
-    # type: (str) -> str
+    # type: (six.string_types) -> six.string_types
     """ Extract repository slug from a GitHub link
     >>> extract_repo("/org/repo/blabla?something=foo")
@@ -315,6 +319,7 @@ class Scraper(object):
     # after many experiments, 40/121 looks to be the fastest option
     queue_max_size = 40
     queue_time_length = 121
+    retries_on_timeout = 5
 
     def __new__(cls, *args, **kwargs):  # Singleton
         if not isinstance(cls._instance, cls):
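
Since retries_on_timeout is a plain class attribute and Scraper is a singleton, the retry budget can be tuned globally without subclassing. A hypothetical tweak:

from stgithub import Scraper

Scraper.retries_on_timeout = 10  # be more patient on flaky networks
scraper = Scraper()              # __new__ returns the shared instance
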
@@ -341,8 +346,24 @@ def _request(self, url, params=None, headers=None):
             time.sleep(sleep_interval)
 
         self.queue.put(time.time())
-        r = requests.get(
-            url, cookies=self.cookies, headers=headers, params=params)
+
+        # handle network errors and GitHub downtimes
+        r = None
+        for _ in range(self.retries_on_timeout):
+            try:
+                r = requests.get(url, cookies=self.cookies,
+                                 headers=headers, params=params)
+            except requests.exceptions.RequestException:
+                continue
+            if r.status_code < 500:
+                break
+        else:
+            r = None
+
+        if r is None:
+            raise GitHubScrapingError(
+                "GitHub is not responding to requests. Try again later.")
+
         if r.status_code == 429:
             logging.info("Hit GitHub XHR rate limit, retry in 10 seconds..")
             time.sleep(10)
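
The loop relies on Python's for/else: the else clause runs only when the loop finishes without hitting break, i.e. when every attempt either raised a RequestException or returned a 5xx. Resetting r = None there also discards a leftover 5xx response from the final attempt, so the check below has a single failure signal. A standalone sketch of the same pattern (names are illustrative, not part of stgithub):

def fetch_with_retries(fetch, retries=5):
    """Call `fetch` until it returns a non-5xx response, else None."""
    response = None
    for _ in range(retries):
        try:
            response = fetch()
        except IOError:        # stand-in for requests.RequestException
            continue
        if response.status_code < 500:
            break              # success (or client error): stop retrying
    else:
        response = None        # all attempts failed or returned 5xx
    return response
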
@@ -366,8 +387,8 @@ def project_contributor_stats(self, repo_slug):
         of lines added, changed or deleted. Note that weeks are
         started on Sunday and represented by a Unix timestamp.
 
-        >>> Scraper().project_contributor_stats('pandas-dev/pandas')
-        [{u'author': {u'avatar': u'https://avatars0.githubusercontent.com/u/1435085?s=60&v=4',
+        >>> Scraper().project_contributor_stats('pandas-dev/pandas')  # doctest: +SKIP
+        [{u'author': {u'avatar': u'https://avatars0.githubusercontent.com/...',
         u'hovercard_url': u'/hovercards?user_id=1435085',
         u'id': 1435085,
         u'login': u'blbradley',
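
The # doctest: +SKIP directive keeps the example visible in the docstring while telling doctest not to execute it, which fits here: the live pandas-dev/pandas contributor stats change constantly, so literal output could never match. Running the module's doctests would now skip this call:

import doctest

import stgithub

# examples marked `# doctest: +SKIP` are shown in docs but never run
doctest.testmod(stgithub)
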
