Follow redirects on HTTP requests (#361)
The lack of redirect support is noticeable when a local PyPI server is
used with a fallback to the official one; that is how the pypiserver
implementation works, for example.

On PyPI pages, all package links are relative, not absolute. This means
we cannot use the original request URL to construct the full package
URL for fetching; we have to respect the URL we were redirected to.

Without redirect resolution we end up with URLs that our pypiserver
cannot process, and the HTTP 404 response will obviously never match
the md5 package digest, so we fail at the verification stage anyway.
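
To illustrate (hypothetical server URLs; the join itself is just the
stdlib urljoin), here is why the redirected URL has to serve as the base
when resolving relative package links:

from urllib.parse import urljoin  # Python 3 spelling of the stdlib helper

# The index request to the local server gets redirected to the official index.
requested = 'http://localhost:8080/simple/3to2/'
redirected = 'https://pypi.org/simple/3to2/'

# Package links on the index page are relative, e.g.:
href = '../../packages/source/3/3to2/3to2-1.0.tar.gz'

# Joined against the originally requested URL, the result is a path the
# local server cannot serve, hence the HTTP 404:
urljoin(requested, href)   # -> 'http://localhost:8080/packages/source/3/3to2/3to2-1.0.tar.gz'

# Joined against the redirected URL, we get the real package URL:
urljoin(redirected, href)  # -> 'https://pypi.org/packages/source/3/3to2/3to2-1.0.tar.gz'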

Fixes #200
Fixes #270
kxepal authored and kwlzn committed Feb 14, 2017
1 parent 07e2ffb commit 6ed76cb
Showing 3 changed files with 34 additions and 0 deletions.
1 change: 1 addition & 0 deletions pex/crawler.py
@@ -95,6 +95,7 @@ def crawl_local(cls, link):
   @classmethod
   def crawl_remote(cls, context, link):
     try:
+      link = context.resolve(link)
       content = context.content(link)
     except context.Error as e:
       TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
19 changes: 19 additions & 0 deletions pex/http.py
@@ -119,6 +119,13 @@ def fetch(self, link, into=None):
     os.rename(target_tmp, target)
     return target
 
+  def resolve(self, link):
+    """Resolves the final link through all redirections.
+
+    :param link: The :class:`Link` to open.
+    """
+    return link
+
 
 class UrllibContext(Context):
   """Default Python standard library Context."""
@@ -134,6 +141,12 @@ def content(self, link):
     encoding = message_from_string(str(fp.headers)).get_content_charset(self.DEFAULT_ENCODING)
     return fp.read().decode(encoding, 'replace')
 
+  def resolve(self, link):
+    request = urllib_request.Request(link.url)
+    request.get_method = lambda: 'HEAD'
+    with contextlib.closing(urllib_request.urlopen(request)) as response:
+      return link.wrap(response.url)
+
   def __init__(self, *args, **kw):
     TRACER.log('Warning, using a UrllibContext which is known to be flaky.')
     TRACER.log('Please build pex with the requests module for more reliable downloads.')
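
For context, a standalone sketch of the urllib technique used above,
assuming any URL that actually redirects: urlopen follows redirects
automatically, and the response's url attribute reports the final
location. Overriding get_method is the Python 2/3-compatible way to send
a HEAD request (pex's urllib_request is its compat alias for this
module); on Python 3 alone, urllib.request.Request(url, method='HEAD')
would do the same.

import contextlib
import urllib.request

request = urllib.request.Request('http://pypi.org/simple/')  # hypothetical redirecting URL
request.get_method = lambda: 'HEAD'  # issue HEAD instead of GET
with contextlib.closing(urllib.request.urlopen(request)) as response:
  print(response.url)  # final URL after redirects, e.g. 'https://pypi.org/simple/'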
@@ -246,6 +259,12 @@ def content(self, link):
     with contextlib.closing(self.open(link)) as request:
       return request.read().decode(request.encoding or self.DEFAULT_ENCODING, 'replace')
 
+  def resolve(self, link):
+    return link.wrap(self._session.head(
+        link.url, verify=self._verify, allow_redirects=True,
+        headers={'User-Agent': self.USER_AGENT}
+    ).url)
+
 
 if requests:
   Context.register(RequestsContext)
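
And a quick usage sketch of the requests-backed path (hypothetical index
URL; assumes the requests module is installed so RequestsContext is
registered, and that Link lives in pex.link as in this pex era): resolve
issues a single HEAD request with allow_redirects=True, so only headers
travel over the wire, and the returned Link carries the final URL for
content and fetch to work against.

from pex.http import RequestsContext
from pex.link import Link

context = RequestsContext()
link = Link('http://localhost:8080/simple/3to2/')  # hypothetical local index

resolved = context.resolve(link)  # follows any redirect chain via a HEAD request
page = context.content(resolved)  # relative hrefs on the page now resolve correctly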
14 changes: 14 additions & 0 deletions tests/test_crawler.py
@@ -152,6 +152,7 @@ def test_crawler_remote():
   Crawler.reset_cache()
 
   mock_context = mock.create_autospec(Context, spec_set=True)
+  mock_context.resolve = lambda link: link
   mock_context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')]
   expected_output = set([Link('http://url1.test.com/3to2-1.0.tar.gz'),
                          Link('http://url2.test.com/APScheduler-2.1.0.tar.gz')])
@@ -164,4 +165,17 @@ def test_crawler_remote():
   assert c.crawl(test_links) == expected_output
 
 
+def test_crawler_remote_redirect():
+  Crawler.reset_cache()
+
+  mock_context = mock.create_autospec(Context, spec_set=True)
+  mock_context.resolve = lambda link: Link('http://url2.test.com')
+  mock_context.content.side_effect = [MOCK_INDEX_A]
+  expected_output = set([Link('http://url2.test.com/3to2-1.0.tar.gz')])
+
+  c = Crawler(mock_context)
+  test_links = [Link('http://url1.test.com')]
+  assert c.crawl(test_links) == expected_output
+
+
 # TODO(wickman): test page decoding via mock
