diff --git a/pex/crawler.py b/pex/crawler.py index ee89d7fee..e0be8f5cb 100644 --- a/pex/crawler.py +++ b/pex/crawler.py @@ -95,6 +95,7 @@ def crawl_local(cls, link): @classmethod def crawl_remote(cls, context, link): try: + link = context.resolve(link) content = context.content(link) except context.Error as e: TRACER.log('Failed to read %s: %s' % (link.url, e), V=1) diff --git a/pex/http.py b/pex/http.py index 6396ff8c3..a1e68419f 100644 --- a/pex/http.py +++ b/pex/http.py @@ -119,6 +119,13 @@ def fetch(self, link, into=None): os.rename(target_tmp, target) return target + def resolve(self, link): + """Resolves the final link through all redirections. + + :param link: The :class:`Link` to resolve. + """ + return link + class UrllibContext(Context): """Default Python standard library Context.""" @@ -134,6 +141,12 @@ def content(self, link): encoding = message_from_string(str(fp.headers)).get_content_charset(self.DEFAULT_ENCODING) return fp.read().decode(encoding, 'replace') + def resolve(self, link): + request = urllib_request.Request(link.url) + request.get_method = lambda: 'HEAD' + with contextlib.closing(urllib_request.urlopen(request)) as response: + return link.wrap(response.url) + def __init__(self, *args, **kw): TRACER.log('Warning, using a UrllibContext which is known to be flaky.') TRACER.log('Please build pex with the requests module for more reliable downloads.') @@ -246,6 +259,12 @@ def content(self, link): with contextlib.closing(self.open(link)) as request: return request.read().decode(request.encoding or self.DEFAULT_ENCODING, 'replace') + def resolve(self, link): + return link.wrap(self._session.head( + link.url, verify=self._verify, allow_redirects=True, + headers={'User-Agent': self.USER_AGENT} + ).url) + if requests: Context.register(RequestsContext) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 3fc44ebec..7e7fcbebb 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -152,6 +152,7 @@ def test_crawler_remote(): 
Crawler.reset_cache() mock_context = mock.create_autospec(Context, spec_set=True) + mock_context.resolve = lambda link: link mock_context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')] expected_output = set([Link('http://url1.test.com/3to2-1.0.tar.gz'), Link('http://url2.test.com/APScheduler-2.1.0.tar.gz')]) @@ -164,4 +165,17 @@ def test_crawler_remote(): assert c.crawl(test_links) == expected_output +def test_crawler_remote_redirect(): + Crawler.reset_cache() + + mock_context = mock.create_autospec(Context, spec_set=True) + mock_context.resolve = lambda link: Link('http://url2.test.com') + mock_context.content.side_effect = [MOCK_INDEX_A] + expected_output = set([Link('http://url2.test.com/3to2-1.0.tar.gz')]) + + c = Crawler(mock_context) + test_links = [Link('http://url1.test.com')] + assert c.crawl(test_links) == expected_output + + # TODO(wickman): test page decoding via mock