From 212a4c66bda0efda86a373626011ef5bdeaeb319 Mon Sep 17 00:00:00 2001
From: Andy Hayden
Date: Tue, 11 Jun 2013 01:50:47 +0100
Subject: [PATCH 1/2] ENH use requests over urllib2 if available

---
 pandas/io/common.py                      | 70 +++++++++++++++++++-----
 pandas/io/data.py                        | 20 +++----
 pandas/io/html.py                        | 17 +++---
 pandas/io/tests/test_html.py             |  7 ++-
 pandas/io/tests/test_json/test_pandas.py |  4 ++
 5 files changed, 79 insertions(+), 39 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 353930482c8b8..5fdaec0f0f0d1 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -34,23 +34,25 @@ def _is_s3_url(url):
         return False


 def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
-    """ if the filepath_or_buffer is a url, translate and return the buffer
-    passthru otherwise
+    """
+    if the filepath_or_buffer is a url, translate and return the buffer
+    passthrough otherwise

-        Parameters
-        ----------
-        filepath_or_buffer : a url, filepath, or buffer
-        encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
+    Parameters
+    ----------
+    filepath_or_buffer : a url, filepath, or buffer
+    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

-        Returns
-        -------
-        a filepath_or_buffer, the encoding
-
-        """
+    Returns
+    -------
+    a filepath_or_buffer, the encoding
+
+    """
     if _is_url(filepath_or_buffer):
-        from urllib2 import urlopen
-        filepath_or_buffer = urlopen(filepath_or_buffer)
+
+        _, filepath_or_buffer = _req_url(filepath_or_buffer)  # _req_url raises on a bad status code
+
     if py3compat.PY3:  # pragma: no cover
         if encoding:
             errors = 'strict'
@@ -65,7 +67,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
     if _is_s3_url(filepath_or_buffer):
         try:
             import boto
-        except:
+        except ImportError:
             raise ImportError("boto is required to handle s3 files")
         # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
         # are environment variables
@@ -78,3 +80,43 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
             return filepath_or_buffer, None

     return filepath_or_buffer, None
+
+
+def _req_url(url):
+    '''
+    Retrieve the text output of a request to url.
+    Raises on a bad status code or an invalid URL.
+    Prefers the requests module if available.
+
+    Parameters
+    ----------
+    url : string
+
+    Returns
+    -------
+    status_code : int, the HTTP status_code
+    buf_text : the text from the url request
+
+    '''
+    try_requests = True
+    if try_requests:
+        try:
+            import requests
+            resp = requests.get(url)
+            resp.raise_for_status()
+            buf_text = StringIO(resp.text)
+            status_code = resp.status_code
+            return status_code, buf_text
+        except ImportError:
+            pass
+        except (requests.exceptions.InvalidURL,
+                requests.exceptions.InvalidSchema):
+            # requests can't deal with local files
+            pass
+
+    import urllib2
+    resp = urllib2.urlopen(url)
+    # no need to catch urllib2.URLError and re-raise as ValueError here:
+    # urlopen already raises it for invalid urls
+    status_code = resp.code
+    buf_text = resp  # urlopen raises HTTPError for bad status codes
+    return status_code, buf_text
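The core of the change is the requests-first, urllib2-fallback pattern that _req_url implements above. A minimal standalone sketch of that pattern, with an illustrative name and stripped of the buffering details (not part of the patch):

    def fetch_text(url):
        # prefer requests when it is importable; unlike urllib2 it
        # transparently decodes gzip bodies and has a friendlier API
        try:
            import requests
        except ImportError:
            import urllib2
            return urllib2.urlopen(url).read()  # raises HTTPError on a bad status
        resp = requests.get(url)
        resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
        return resp.text
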
diff --git a/pandas/io/data.py b/pandas/io/data.py
index 8bc3df561cadb..ca4e84eacf5ae 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -16,7 +16,7 @@
 from pandas import Panel, DataFrame, Series, read_csv, concat
 from pandas.io.parsers import TextParser
-
+from pandas.io.common import _req_url

 def DataReader(name, data_source=None, start=None, end=None,
                retry_count=3, pause=0):
@@ -166,10 +166,9 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
            '&ignore=.csv'

     for _ in range(retry_count):
-        resp = urllib2.urlopen(url)
-        if resp.code == 200:
-            lines = resp.read()
-            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
+        status_code, buf_text = _req_url(url)
+        if status_code == 200:
+            rs = read_csv(buf_text, index_col=0,
                           parse_dates=True)[::-1]

             # Yahoo! Finance sometimes does this awesome thing where they
@@ -206,11 +205,9 @@ def _get_hist_google(sym=None, start=None, end=None, retry_count=3,
                                "startdate": start.strftime('%b %d, %Y'), \
                                "enddate": end.strftime('%b %d, %Y'),
                                "output": "csv" })
     for _ in range(retry_count):
-        resp = urllib2.urlopen(url)
-        if resp.code == 200:
-            lines = resp.read()
-            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
-                          parse_dates=True)[::-1]
+        status_code, buf_text = _req_url(url)
+        if status_code == 200:
+            rs = read_csv(buf_text, index_col=0, parse_dates=True)[::-1]

             return rs
@@ -472,8 +469,7 @@ def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),

     fred_URL = "http://research.stlouisfed.org/fred2/series/"

-    url = fred_URL + '%s' % name + \
-        '/downloaddata/%s' % name + '.csv'
+    url = '%s%s/downloaddata/%s.csv' % (fred_URL, name, name)
     data = read_csv(urllib.urlopen(url), index_col=0, parse_dates=True,
                     header=None, skiprows=1, names=["DATE", name],
                     na_values='.')
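The data.py changes above rely on read_csv accepting any object with a read() method, so the StringIO buffer that _req_url returns can be handed straight to the parser, replacing the old resp.read()/bytes_to_str/StringIO dance. A quick illustration with a hand-rolled buffer (the CSV content here is made up):

    from StringIO import StringIO
    from pandas import read_csv

    # read_csv treats the buffer like an open file, which is why
    # _req_url's StringIO return value can be passed through directly
    buf = StringIO("DATE,VALUE\n2010-01-01,1.0\n2010-04-01,1.1\n")
    df = read_csv(buf, index_col=0, parse_dates=True)
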
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 08a9403cd18a7..a1f03888cc880 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -20,7 +20,7 @@
 import numpy as np

 from pandas import DataFrame, MultiIndex, isnull
-from pandas.io.common import _is_url
+from pandas.io.common import _is_url, _req_url


 try:
@@ -107,7 +107,8 @@ def _get_skiprows_iter(skiprows):


 def _read(io):
-    """Try to read from a url, file or string.
+    """
+    Try to read from a url, file or string.

     Parameters
     ----------
@@ -118,11 +119,8 @@ def _read(io):
     raw_text : str
     """
     if _is_url(io):
-        try:
-            with contextlib.closing(urllib2.urlopen(io)) as url:
-                raw_text = url.read()
-        except urllib2.URLError:
-            raise ValueError('Invalid URL: "{0}"'.format(io))
+        _, buf_text = _req_url(io)
+        raw_text = buf_text.read()
     elif hasattr(io, 'read'):
         raw_text = io.read()
     elif os.path.isfile(io):
@@ -720,7 +718,7 @@ def _parser_dispatch(flavor):
         if not _HAS_HTML5LIB:
             raise ImportError("html5lib not found please install it")
         if not _HAS_BS4:
-            raise ImportError("bs4 not found please install it")
+            raise ImportError("bs4 (beautifulsoup4) not found please install it")
     else:
         if not _HAS_LXML:
             raise ImportError("lxml not found please install it")
@@ -758,10 +756,9 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs):
     for flav in flavor:
         parser = _parser_dispatch(flav)
         p = parser(io, compiled_match, attrs)
-
         try:
             tables = p.parse_tables()
-        except Exception as caught:
+        except ValueError as caught:
             retained = caught
         else:
             break
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index d6086d822ee02..bdfd666d14ac6 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -263,14 +263,15 @@ def test_file_like(self):
         assert_framelist_equal(df1, df2)

     def test_bad_url_protocol(self):
-        self.assertRaises(ValueError, self.run_read_html, 'git://github.com',
-                          '.*Water.*')
+        from urllib2 import URLError
+        self.assertRaises(URLError, self.run_read_html,
+                          'git://github.com', '.*Water.*')

     @slow
     def test_file_url(self):
         url = self.banklist_data
         dfs = self.run_read_html('file://' + url, 'First',
-                                 attrs={'id': 'table'})
+                                attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
         for df in dfs:
             self.assertIsInstance(df, DataFrame)
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
index b64bfaacd38f2..b4f11d2c7cbde 100644
--- a/pandas/io/tests/test_json/test_pandas.py
+++ b/pandas/io/tests/test_json/test_pandas.py
@@ -350,6 +350,10 @@ def test_url(self):

             url = 'http://search.twitter.com/search.json?q=pandas%20python'
             result = read_json(url)
+
+            # gzip compression
+            url = 'https://api.stackexchange.com/2.1/search?page=1&pagesize=10&order=desc&sort=activity&tagged=pandas&site=stackoverflow'
+            result = pd.read_json(url)
         except urllib2.URLError:
             raise nose.SkipTest

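The Stack Exchange URL added to test_url exercises gzip-compressed responses: that API always gzips its output, and while requests decodes a gzip Content-Encoding transparently, urllib2 hands back the raw compressed bytes. A sketch of the manual handling urllib2 would otherwise need (illustrative only, not part of the patch):

    import gzip
    import urllib2
    from StringIO import StringIO

    def urlopen_gzip_aware(url):
        # urllib2 does not decode gzip bodies, so the caller must check
        # Content-Encoding and decompress by hand; requests does this
        # transparently, which is one of the reasons to prefer it
        req = urllib2.Request(url, headers={'Accept-Encoding': 'gzip'})
        resp = urllib2.urlopen(req)
        data = resp.read()
        if resp.headers.get('Content-Encoding') == 'gzip':
            data = gzip.GzipFile(fileobj=StringIO(data)).read()
        return data
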
From 60cfa8dd307720d820c477006f7fb0cd46eb0554 Mon Sep 17 00:00:00 2001
From: Andy Hayden
Date: Tue, 11 Jun 2013 23:02:51 +0100
Subject: [PATCH 2/2] FIX revert html changes because of weird lxml error

---
 pandas/io/html.py            | 17 ++++++++++-------
 pandas/io/tests/test_html.py |  7 +++----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index a1f03888cc880..08a9403cd18a7 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -20,7 +20,7 @@
 import numpy as np

 from pandas import DataFrame, MultiIndex, isnull
-from pandas.io.common import _is_url, _req_url
+from pandas.io.common import _is_url


 try:
@@ -107,8 +107,7 @@ def _get_skiprows_iter(skiprows):


 def _read(io):
-    """
-    Try to read from a url, file or string.
+    """Try to read from a url, file or string.

     Parameters
     ----------
@@ -119,8 +118,11 @@ def _read(io):
     raw_text : str
     """
     if _is_url(io):
-        _, buf_text = _req_url(io)
-        raw_text = buf_text.read()
+        try:
+            with contextlib.closing(urllib2.urlopen(io)) as url:
+                raw_text = url.read()
+        except urllib2.URLError:
+            raise ValueError('Invalid URL: "{0}"'.format(io))
     elif hasattr(io, 'read'):
         raw_text = io.read()
     elif os.path.isfile(io):
@@ -718,7 +720,7 @@ def _parser_dispatch(flavor):
         if not _HAS_HTML5LIB:
             raise ImportError("html5lib not found please install it")
         if not _HAS_BS4:
-            raise ImportError("bs4 (beautifulsoup4) not found please install it")
+            raise ImportError("bs4 not found please install it")
     else:
         if not _HAS_LXML:
             raise ImportError("lxml not found please install it")
@@ -756,9 +758,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs):
     for flav in flavor:
         parser = _parser_dispatch(flav)
         p = parser(io, compiled_match, attrs)
+
         try:
             tables = p.parse_tables()
-        except ValueError as caught:
+        except Exception as caught:
             retained = caught
         else:
             break
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index bdfd666d14ac6..d6086d822ee02 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -263,15 +263,14 @@ def test_file_like(self):
         assert_framelist_equal(df1, df2)

     def test_bad_url_protocol(self):
-        from urllib2 import URLError
-        self.assertRaises(URLError, self.run_read_html,
-                          'git://github.com', '.*Water.*')
+        self.assertRaises(ValueError, self.run_read_html, 'git://github.com',
+                          '.*Water.*')

     @slow
     def test_file_url(self):
         url = self.banklist_data
         dfs = self.run_read_html('file://' + url, 'First',
-                                attrs={'id': 'table'})
+                                 attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
         for df in dfs:
             self.assertIsInstance(df, DataFrame)
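
With the revert applied, read_html's URL handling is back on urllib2, so an unsupported protocol surfaces as ValueError again, which is exactly what the restored test_bad_url_protocol asserts. A quick interactive check of that behaviour (illustrative only):

    from pandas.io.html import read_html

    # the reverted _read wraps urllib2.URLError in ValueError, so a
    # bogus protocol fails with the error the test expects
    try:
        read_html('git://github.com', match='.*Water.*')
    except ValueError as exc:
        print('raised as expected: %s' % exc)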