From 212a4c66bda0efda86a373626011ef5bdeaeb319 Mon Sep 17 00:00:00 2001
From: Andy Hayden
Date: Tue, 11 Jun 2013 01:50:47 +0100
Subject: [PATCH 1/2] ENH use requests over urllib2 if available

---
 pandas/io/common.py                      | 70 +++++++++++++++++++-----
 pandas/io/data.py                        | 20 +++----
 pandas/io/html.py                        | 17 +++---
 pandas/io/tests/test_html.py             |  7 ++-
 pandas/io/tests/test_json/test_pandas.py |  4 ++
 5 files changed, 79 insertions(+), 39 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 353930482c8b8..5fdaec0f0f0d1 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -34,23 +34,25 @@ def _is_s3_url(url):
         return False


 def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
-    """ if the filepath_or_buffer is a url, translate and return the buffer
-    passthru otherwise
+    """
+    if the filepath_or_buffer is a url, translate and return the buffer
+    passthrough otherwise

-        Parameters
-        ----------
-        filepath_or_buffer : a url, filepath, or buffer
-        encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
+    Parameters
+    ----------
+    filepath_or_buffer : a url, filepath, or buffer
+    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

-        Returns
-        -------
-        a filepath_or_buffer, the encoding
-
-        """
+    Returns
+    -------
+    a filepath_or_buffer, the encoding
+
+    """
     if _is_url(filepath_or_buffer):
-        from urllib2 import urlopen
-        filepath_or_buffer = urlopen(filepath_or_buffer)
+
+        _, filepath_or_buffer = _req_url(filepath_or_buffer)  # _req_url raises on a bad status code
+
     if py3compat.PY3:  # pragma: no cover
         if encoding:
             errors = 'strict'
@@ -65,7 +67,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
     if _is_s3_url(filepath_or_buffer):
         try:
             import boto
-        except:
+        except ImportError:
             raise ImportError("boto is required to handle s3 files")
         # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
         # are environment variables
@@ -78,3 +80,43 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
             return filepath_or_buffer, None

     return filepath_or_buffer, None
+
+
+def _req_url(url):
+    '''
+    Retrieve the text output of a request to url.
+    Raises on a bad status code or an invalid URL.
+    Prefers the requests module if available.
+
+    Parameters
+    ----------
+    url : string
+
+    Returns
+    -------
+    status_code : int, the HTTP status_code
+    buf_text : the text from the url request
+
+    '''
+    try_requests = True
+    if try_requests:
+        try:
+            import requests
+            resp = requests.get(url)
+            resp.raise_for_status()
+            buf_text = StringIO(resp.text)
+            status_code = resp.status_code
+            return status_code, buf_text
+        except ImportError:
+            pass
+        except (requests.exceptions.InvalidURL,
+                requests.exceptions.InvalidSchema):
+            # requests can't deal with local files
+            pass
+
+    import urllib2
+    resp = urllib2.urlopen(url)
+    # no need to catch urllib2.URLError and re-raise as ValueError here:
+    # urlopen already raises it for invalid urls
+    status_code = resp.code
+    buf_text = resp  # urlopen raises HTTPError for bad status codes
+    return status_code, buf_text
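The core of the change is the requests-first, urllib2-fallback pattern that _req_url implements above. A minimal standalone sketch of that pattern, with an illustrative name and stripped of the buffering details (not part of the patch):

    def fetch_text(url):
        # prefer requests when it is importable; unlike urllib2 it
        # transparently decodes gzip bodies and has a friendlier API
        try:
            import requests
        except ImportError:
            import urllib2
            return urllib2.urlopen(url).read()  # raises HTTPError on a bad status
        resp = requests.get(url)
        resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
        return resp.text
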
diff --git a/pandas/io/data.py b/pandas/io/data.py
index 8bc3df561cadb..ca4e84eacf5ae 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -16,7 +16,7 @@
 from pandas import Panel, DataFrame, Series, read_csv, concat
 from pandas.io.parsers import TextParser
-
+from pandas.io.common import _req_url

 def DataReader(name, data_source=None, start=None, end=None,
                retry_count=3, pause=0):
@@ -166,10 +166,9 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
            '&ignore=.csv'

     for _ in range(retry_count):
-        resp = urllib2.urlopen(url)
-        if resp.code == 200:
-            lines = resp.read()
-            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
+        status_code, buf_text = _req_url(url)
+        if status_code == 200:
+            rs = read_csv(buf_text, index_col=0,
                           parse_dates=True)[::-1]

             # Yahoo! Finance sometimes does this awesome thing where they
@@ -206,11 +205,9 @@ def _get_hist_google(sym=None, start=None, end=None, retry_count=3,
                                "startdate": start.strftime('%b %d, %Y'), \
                                "enddate": end.strftime('%b %d, %Y'),
                                "output": "csv" })
     for _ in range(retry_count):
-        resp = urllib2.urlopen(url)
-        if resp.code == 200:
-            lines = resp.read()
-            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
-                          parse_dates=True)[::-1]
+        status_code, buf_text = _req_url(url)
+        if status_code == 200:
+            rs = read_csv(buf_text, index_col=0, parse_dates=True)[::-1]

             return rs
@@ -472,8 +469,7 @@ def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),

     fred_URL = "http://research.stlouisfed.org/fred2/series/"

-    url = fred_URL + '%s' % name + \
-        '/downloaddata/%s' % name + '.csv'
+    url = '%s%s/downloaddata/%s.csv' % (fred_URL, name, name)
     data = read_csv(urllib.urlopen(url), index_col=0, parse_dates=True,
                     header=None, skiprows=1, names=["DATE", name],
                     na_values='.')
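The data.py changes above rely on read_csv accepting any object with a read() method, so the StringIO buffer that _req_url returns can be handed straight to the parser, replacing the old resp.read()/bytes_to_str/StringIO dance. A quick illustration with a hand-rolled buffer (the CSV content here is made up):

    from StringIO import StringIO
    from pandas import read_csv

    # read_csv treats the buffer like an open file, which is why
    # _req_url's StringIO return value can be passed through directly
    buf = StringIO("DATE,VALUE\n2010-01-01,1.0\n2010-04-01,1.1\n")
    df = read_csv(buf, index_col=0, parse_dates=True)
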
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 08a9403cd18a7..a1f03888cc880 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -20,7 +20,7 @@
 import numpy as np

 from pandas import DataFrame, MultiIndex, isnull
-from pandas.io.common import _is_url
+from pandas.io.common import _is_url, _req_url


 try:
@@ -107,7 +107,8 @@ def _get_skiprows_iter(skiprows):


 def _read(io):
-    """Try to read from a url, file or string.
+    """
+    Try to read from a url, file or string.

     Parameters
     ----------
@@ -118,11 +119,8 @@ def _read(io):
     raw_text : str
     """
     if _is_url(io):
-        try:
-            with contextlib.closing(urllib2.urlopen(io)) as url:
-                raw_text = url.read()
-        except urllib2.URLError:
-            raise ValueError('Invalid URL: "{0}"'.format(io))
+        _, buf_text = _req_url(io)
+        raw_text = buf_text.read()
     elif hasattr(io, 'read'):
         raw_text = io.read()
     elif os.path.isfile(io):
@@ -720,7 +718,7 @@ def _parser_dispatch(flavor):
         if not _HAS_HTML5LIB:
             raise ImportError("html5lib not found please install it")
         if not _HAS_BS4:
-            raise ImportError("bs4 not found please install it")
+            raise ImportError("bs4 (beautifulsoup4) not found please install it")
     else:
         if not _HAS_LXML:
             raise ImportError("lxml not found please install it")
@@ -758,10 +756,9 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs):
     for flav in flavor:
         parser = _parser_dispatch(flav)
         p = parser(io, compiled_match, attrs)
-
         try:
             tables = p.parse_tables()
-        except Exception as caught:
+        except ValueError as caught:
             retained = caught
         else:
             break
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index d6086d822ee02..bdfd666d14ac6 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -263,14 +263,15 @@ def test_file_like(self):
         assert_framelist_equal(df1, df2)

     def test_bad_url_protocol(self):
-        self.assertRaises(ValueError, self.run_read_html, 'git://github.com',
-                          '.*Water.*')
+        from urllib2 import URLError
+        self.assertRaises(URLError, self.run_read_html,
+                          'git://github.com', '.*Water.*')

     @slow
     def test_file_url(self):
         url = self.banklist_data
         dfs = self.run_read_html('file://' + url, 'First',
-                                 attrs={'id': 'table'})
+                                attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
         for df in dfs:
             self.assertIsInstance(df, DataFrame)
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
index b64bfaacd38f2..b4f11d2c7cbde 100644
--- a/pandas/io/tests/test_json/test_pandas.py
+++ b/pandas/io/tests/test_json/test_pandas.py
@@ -350,6 +350,10 @@ def test_url(self):

             url = 'http://search.twitter.com/search.json?q=pandas%20python'
             result = read_json(url)
+
+            # gzip compression
+            url = 'https://api.stackexchange.com/2.1/search?page=1&pagesize=10&order=desc&sort=activity&tagged=pandas&site=stackoverflow'
+            result = pd.read_json(url)
         except urllib2.URLError:
             raise nose.SkipTest

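The Stack Exchange URL added to test_url exercises gzip-compressed responses: that API always gzips its output, and while requests decodes a gzip Content-Encoding transparently, urllib2 hands back the raw compressed bytes. A sketch of the manual handling urllib2 would otherwise need (illustrative only, not part of the patch):

    import gzip
    import urllib2
    from StringIO import StringIO

    def urlopen_gzip_aware(url):
        # urllib2 does not decode gzip bodies, so the caller must check
        # Content-Encoding and decompress by hand; requests does this
        # transparently, which is one of the reasons to prefer it
        req = urllib2.Request(url, headers={'Accept-Encoding': 'gzip'})
        resp = urllib2.urlopen(req)
        data = resp.read()
        if resp.headers.get('Content-Encoding') == 'gzip':
            data = gzip.GzipFile(fileobj=StringIO(data)).read()
        return data
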
From 60cfa8dd307720d820c477006f7fb0cd46eb0554 Mon Sep 17 00:00:00 2001
From: Andy Hayden
Date: Tue, 11 Jun 2013 23:02:51 +0100
Subject: [PATCH 2/2] FIX revert html changes because of weird lxml error

---
 pandas/io/html.py            | 17 ++++++++++-------
 pandas/io/tests/test_html.py |  7 +++----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index a1f03888cc880..08a9403cd18a7 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -20,7 +20,7 @@
 import numpy as np

 from pandas import DataFrame, MultiIndex, isnull
-from pandas.io.common import _is_url, _req_url
+from pandas.io.common import _is_url


 try:
@@ -107,8 +107,7 @@ def _get_skiprows_iter(skiprows):


 def _read(io):
-    """
-    Try to read from a url, file or string.
+    """Try to read from a url, file or string.

     Parameters
     ----------
@@ -119,8 +118,11 @@ def _read(io):
     raw_text : str
     """
     if _is_url(io):
-        _, buf_text = _req_url(io)
-        raw_text = buf_text.read()
+        try:
+            with contextlib.closing(urllib2.urlopen(io)) as url:
+                raw_text = url.read()
+        except urllib2.URLError:
+            raise ValueError('Invalid URL: "{0}"'.format(io))
     elif hasattr(io, 'read'):
         raw_text = io.read()
     elif os.path.isfile(io):
@@ -718,7 +720,7 @@ def _parser_dispatch(flavor):
         if not _HAS_HTML5LIB:
             raise ImportError("html5lib not found please install it")
         if not _HAS_BS4:
-            raise ImportError("bs4 (beautifulsoup4) not found please install it")
+            raise ImportError("bs4 not found please install it")
     else:
         if not _HAS_LXML:
             raise ImportError("lxml not found please install it")
@@ -756,9 +758,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs):
     for flav in flavor:
         parser = _parser_dispatch(flav)
         p = parser(io, compiled_match, attrs)
+
         try:
             tables = p.parse_tables()
-        except ValueError as caught:
+        except Exception as caught:
             retained = caught
         else:
             break
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index bdfd666d14ac6..d6086d822ee02 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -263,15 +263,14 @@ def test_file_like(self):
         assert_framelist_equal(df1, df2)

     def test_bad_url_protocol(self):
-        from urllib2 import URLError
-        self.assertRaises(URLError, self.run_read_html,
-                          'git://github.com', '.*Water.*')
+        self.assertRaises(ValueError, self.run_read_html, 'git://github.com',
+                          '.*Water.*')

     @slow
     def test_file_url(self):
         url = self.banklist_data
         dfs = self.run_read_html('file://' + url, 'First',
-                                attrs={'id': 'table'})
+                                 attrs={'id': 'table'})
         self.assertIsInstance(dfs, list)
         for df in dfs:
             self.assertIsInstance(df, DataFrame)
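
With the revert applied, read_html's URL handling is back on urllib2, so an unsupported protocol surfaces as ValueError again, which is exactly what the restored test_bad_url_protocol asserts. A quick interactive check of that behaviour (illustrative only):

    from pandas.io.html import read_html

    # the reverted _read wraps urllib2.URLError in ValueError, so a
    # bogus protocol fails with the error the test expects
    try:
        read_html('git://github.com', match='.*Water.*')
    except ValueError as exc:
        print('raised as expected: %s' % exc)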