diff --git a/pandas/io/common.py b/pandas/io/common.py
index 353930482c8b8..5fdaec0f0f0d1 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -34,23 +34,25 @@ def _is_s3_url(url):
         return False
 
 
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
-    """ if the filepath_or_buffer is a url, translate and return the buffer
-    passthru otherwise
+    """
+    if the filepath_or_buffer is a url, translate and return the buffer
+    passthrough otherwise
 
-    Parameters
-    ----------
-    filepath_or_buffer : a url, filepath, or buffer
-    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
+    Parameters
+    ----------
+    filepath_or_buffer : a url, filepath, or buffer
+    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
 
-    Returns
-    -------
-    a filepath_or_buffer, the encoding
-
-    """
+    Returns
+    -------
+    a filepath_or_buffer, the encoding
+
+    """
 
     if _is_url(filepath_or_buffer):
-        from urllib2 import urlopen
-        filepath_or_buffer = urlopen(filepath_or_buffer)
+
+        _, filepath_or_buffer = _req_url(filepath_or_buffer)  # raise if not status_code 200?
+
         if py3compat.PY3:  # pragma: no cover
             if encoding:
                 errors = 'strict'
@@ -65,7 +67,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
     if _is_s3_url(filepath_or_buffer):
         try:
             import boto
-        except:
+        except ImportError:
             raise ImportError("boto is required to handle s3 files")
         # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
         # are environment variables
@@ -78,3 +80,43 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
             return filepath_or_buffer, None
 
     return filepath_or_buffer, None
+
+def _req_url(url):
+    '''
+    Retrieves text output of request to url
+    Raises on bad status_code or invalid urls
+    Prefers the requests module if available
+
+    Parameters
+    ----------
+    url : string
+
+    Returns
+    -------
+    status_code : int, the HTTP status_code
+    buf_text : the text from the url request
+
+    '''
+    try_requests = True
+    if try_requests:
+        try:
+            import requests
+            resp = requests.get(url)
+            resp.raise_for_status()
+            buf_text = StringIO(resp.text)
+            status_code = resp.status_code
+            return status_code, buf_text
+        except (ImportError,):
+            pass
+        except (requests.exceptions.InvalidURL,
+                requests.exceptions.InvalidSchema):
+            # requests can't deal with local files
+            pass
+
+    import urllib2
+    resp = urllib2.urlopen(url)
+    # except urllib2.URLError:  # don't think there was a purpose to this bit, raises itself
+    #     raise ValueError('Invalid URL: "{0}"'.format(url))
+    status_code = resp.code
+    buf_text = resp  # if status_code == 200 else ''  # If not 200 does it raise?
+    return status_code, buf_text
diff --git a/pandas/io/data.py b/pandas/io/data.py
index 8bc3df561cadb..ca4e84eacf5ae 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -16,7 +16,7 @@
 from pandas import Panel, DataFrame, Series, read_csv, concat
 from pandas.io.parsers import TextParser
-
+from pandas.io.common import _req_url
 
 
 def DataReader(name, data_source=None, start=None, end=None,
                retry_count=3, pause=0):
@@ -166,10 +166,9 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
           '&ignore=.csv'
 
     for _ in range(retry_count):
-        resp = urllib2.urlopen(url)
-        if resp.code == 200:
-            lines = resp.read()
-            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
+        status_code, buf_text = _req_url(url)
+        if status_code == 200:
+            rs = read_csv(buf_text, index_col=0,
                           parse_dates=True)[::-1]
 
             # Yahoo! Finance sometimes does this awesome thing where they
@@ -206,11 +205,9 @@ def _get_hist_google(sym=None, start=None, end=None, retry_count=3,
                               "startdate": start.strftime('%b %d, %Y'), \
                               "enddate": end.strftime('%b %d, %Y'),
                               "output": "csv" })
     for _ in range(retry_count):
-        resp = urllib2.urlopen(url)
-        if resp.code == 200:
-            lines = resp.read()
-            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
-                          parse_dates=True)[::-1]
+        status_code, buf_text = _req_url(url)
+        if status_code == 200:
+            rs = read_csv(buf_text, index_col=0, parse_dates=True)[::-1]
 
             return rs
@@ -472,8 +469,7 @@ def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
 
     fred_URL = "http://research.stlouisfed.org/fred2/series/"
 
-    url = fred_URL + '%s' % name + \
-        '/downloaddata/%s' % name + '.csv'
+    url = '%s%s/downloaddata/%s.csv' % (fred_URL, name, name)
     data = read_csv(urllib.urlopen(url), index_col=0, parse_dates=True,
                     header=None, skiprows=1, names=["DATE", name],
                     na_values='.')
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
index b64bfaacd38f2..b4f11d2c7cbde 100644
--- a/pandas/io/tests/test_json/test_pandas.py
+++ b/pandas/io/tests/test_json/test_pandas.py
@@ -350,6 +350,10 @@ def test_url(self):
             url = 'http://search.twitter.com/search.json?q=pandas%20python'
             result = read_json(url)
+
+            # gzip compression
+            url = 'https://api.stackexchange.com/2.1/search?page=1&pagesize=10&order=desc&sort=activity&tagged=pandas&site=stackoverflow'
+            result = pd.read_json(url)
 
         except urllib2.URLError:
             raise nose.SkipTest
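Note on the fallback logic in _req_url: it prefers requests (which transparently decodes gzip/deflate content-encoding) and drops back to urllib2 when requests is not importable or rejects the URL scheme (e.g. local file paths). A minimal standalone sketch of that pattern follows, for reference only; it targets Python 2 like the patch, and the name fetch_url is hypothetical, not part of this PR:

# Sketch of the requests-first / urllib2-fallback pattern that _req_url
# implements (Python 2; `fetch_url` is a hypothetical name, not in the PR).
from StringIO import StringIO

def fetch_url(url):
    """Return (status_code, file-like buffer) for url."""
    try:
        import requests
        try:
            resp = requests.get(url)
            resp.raise_for_status()  # raises on 4xx/5xx
            # resp.text is already decoded, and transparently gunzipped
            # if the server sent Content-Encoding: gzip
            return resp.status_code, StringIO(resp.text)
        except (requests.exceptions.InvalidURL,
                requests.exceptions.InvalidSchema):
            pass  # e.g. a local file path; fall through to urllib2
    except ImportError:
        pass
    import urllib2
    # urllib2.urlopen raises urllib2.HTTPError (a URLError subclass) on
    # 4xx/5xx by itself, so the status_code == 200 checks at the call
    # sites above are belt-and-braces
    resp = urllib2.urlopen(url)
    return resp.code, resp  # the response object is already file-like

# Usage, mirroring the new data.py call sites:
#   status_code, buf_text = fetch_url(url)
#   if status_code == 200:
#       rs = read_csv(buf_text, index_col=0, parse_dates=True)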
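On the new test_url case: the Stack Exchange API compresses every response, so this URL exercises the gzip path that requests decodes transparently and that a bare urllib2.urlopen would hand back as raw compressed bytes. A quick way to confirm the behaviour (a sketch assuming requests is installed; not part of the patch):

# Hypothetical check, not part of the patch: the Stack Exchange API
# gzips its responses, and requests decompresses them on the fly.
import requests

url = ('https://api.stackexchange.com/2.1/search?page=1&pagesize=10'
       '&order=desc&sort=activity&tagged=pandas&site=stackoverflow')
resp = requests.get(url)
print(resp.headers.get('content-encoding'))  # typically 'gzip'
print(resp.text[:1])                         # '{' -- already-decoded JSON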