Follow redirects on HTTP requests (#361)
The lack of redirect support is noticeable when a local PyPI server is
used with a fallback to the official one; that is how the pypiserver
implementation works, for example.

On PyPI pages, all package links are relative, not absolute. This means
we cannot use the original request URL to construct the full package
URL for fetching; we have to respect the URL we were redirected to.

Without redirect resolution we end up with URLs that our pypiserver
cannot process, and the HTTP 404 response will obviously never match
the md5 package digest, so we fail at the verification stage anyway.
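
To illustrate (hypothetical server URLs; the join itself is just the
stdlib urljoin), here is why the redirected URL has to serve as the base
when resolving relative package links:

from urllib.parse import urljoin  # Python 3 spelling of the stdlib helper

# The index request to the local server gets redirected to the official index.
requested = 'http://localhost:8080/simple/3to2/'
redirected = 'https://pypi.org/simple/3to2/'

# Package links on the index page are relative, e.g.:
href = '../../packages/source/3/3to2/3to2-1.0.tar.gz'

# Joined against the originally requested URL, the result is a path the
# local server cannot serve, hence the HTTP 404:
urljoin(requested, href)   # -> 'http://localhost:8080/packages/source/3/3to2/3to2-1.0.tar.gz'

# Joined against the redirected URL, we get the real package URL:
urljoin(redirected, href)  # -> 'https://pypi.org/packages/source/3/3to2/3to2-1.0.tar.gz'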

Fixes #200
Fixes #270
kxepal authored and kwlzn committed Feb 14, 2017
1 parent 07e2ffb commit 6ed76cb
Showing 3 changed files with 34 additions and 0 deletions.
1 change: 1 addition & 0 deletions pex/crawler.py
@@ -95,6 +95,7 @@ def crawl_local(cls, link):
   @classmethod
   def crawl_remote(cls, context, link):
     try:
+      link = context.resolve(link)
       content = context.content(link)
     except context.Error as e:
       TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
19 changes: 19 additions & 0 deletions pex/http.py
@@ -119,6 +119,13 @@ def fetch(self, link, into=None):
     os.rename(target_tmp, target)
     return target
 
+  def resolve(self, link):
+    """Resolves the final link through all redirections.
+
+    :param link: The :class:`Link` to open.
+    """
+    return link
+
 
 class UrllibContext(Context):
   """Default Python standard library Context."""
@@ -134,6 +141,12 @@ def content(self, link):
     encoding = message_from_string(str(fp.headers)).get_content_charset(self.DEFAULT_ENCODING)
     return fp.read().decode(encoding, 'replace')
 
+  def resolve(self, link):
+    request = urllib_request.Request(link.url)
+    request.get_method = lambda: 'HEAD'
+    with contextlib.closing(urllib_request.urlopen(request)) as response:
+      return link.wrap(response.url)
+
   def __init__(self, *args, **kw):
     TRACER.log('Warning, using a UrllibContext which is known to be flaky.')
     TRACER.log('Please build pex with the requests module for more reliable downloads.')
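
For context, a standalone sketch of the urllib technique used above,
assuming any URL that actually redirects: urlopen follows redirects
automatically, and the response's url attribute reports the final
location. Overriding get_method is the Python 2/3-compatible way to send
a HEAD request (pex's urllib_request is its compat alias for this
module); on Python 3 alone, urllib.request.Request(url, method='HEAD')
would do the same.

import contextlib
import urllib.request

request = urllib.request.Request('http://pypi.org/simple/')  # hypothetical redirecting URL
request.get_method = lambda: 'HEAD'  # issue HEAD instead of GET
with contextlib.closing(urllib.request.urlopen(request)) as response:
  print(response.url)  # final URL after redirects, e.g. 'https://pypi.org/simple/'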
@@ -246,6 +259,12 @@ def content(self, link):
     with contextlib.closing(self.open(link)) as request:
       return request.read().decode(request.encoding or self.DEFAULT_ENCODING, 'replace')
 
+  def resolve(self, link):
+    return link.wrap(self._session.head(
+        link.url, verify=self._verify, allow_redirects=True,
+        headers={'User-Agent': self.USER_AGENT}
+    ).url)
+
 
 if requests:
   Context.register(RequestsContext)
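
And a quick usage sketch of the requests-backed path (hypothetical index
URL; assumes the requests module is installed so RequestsContext is
registered, and that Link lives in pex.link as in this pex era): resolve
issues a single HEAD request with allow_redirects=True, so only headers
travel over the wire, and the returned Link carries the final URL for
content and fetch to work against.

from pex.http import RequestsContext
from pex.link import Link

context = RequestsContext()
link = Link('http://localhost:8080/simple/3to2/')  # hypothetical local index

resolved = context.resolve(link)  # follows any redirect chain via a HEAD request
page = context.content(resolved)  # relative hrefs on the page now resolve correctly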
14 changes: 14 additions & 0 deletions tests/test_crawler.py
@@ -152,6 +152,7 @@ def test_crawler_remote():
   Crawler.reset_cache()
 
   mock_context = mock.create_autospec(Context, spec_set=True)
+  mock_context.resolve = lambda link: link
   mock_context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')]
   expected_output = set([Link('http://url1.test.com/3to2-1.0.tar.gz'),
                          Link('http://url2.test.com/APScheduler-2.1.0.tar.gz')])
@@ -164,4 +165,17 @@ def test_crawler_remote():
   assert c.crawl(test_links) == expected_output
 
 
+def test_crawler_remote_redirect():
+  Crawler.reset_cache()
+
+  mock_context = mock.create_autospec(Context, spec_set=True)
+  mock_context.resolve = lambda link: Link('http://url2.test.com')
+  mock_context.content.side_effect = [MOCK_INDEX_A]
+  expected_output = set([Link('http://url2.test.com/3to2-1.0.tar.gz')])
+
+  c = Crawler(mock_context)
+  test_links = [Link('http://url1.test.com')]
+  assert c.crawl(test_links) == expected_output
+
+
 # TODO(wickman): test page decoding via mock
