BUG: Deprecate nthreads argument (pandas-dev#23112)

The nthreads argument is no longer supported since pyarrow 0.11.0 and was replaced with use_threads. Hence we deprecate the argument now as well so we can remove it in the future. This commit also: - removes feather-format as a dependency and replaces it with usage of pyarrow directly. - sets CI dependencies to respect the changes above. We test backwards compatibility with pyarrow 0.9.0 as conda does not provide a pyarrow 0.10.0 and the conda-forge version has compatibility issues with the rest of the installed packages. Resolves pandas-dev#23053. Resolves pandas-dev#21639.
Pingviinituutti · Feb 28, 2019 · 3fa298c · 3fa298c
1 parent 02d457c
commit 3fa298c
Show file tree

Hide file tree

Showing 12 changed files with 63 additions and 68 deletions.
diff --git a/ci/azure-windows-36.yaml b/ci/azure-windows-36.yaml
@@ -7,7 +7,6 @@ dependencies:
   - bottleneck
   - boost-cpp<1.67
   - fastparquet
-  - feather-format
   - matplotlib
   - numexpr
   - numpy=1.14*

diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt
@@ -2,7 +2,6 @@ beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
 fastparquet
-feather-format
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -13,7 +12,7 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow
+pyarrow>=0.4.1
 pymysql
 pytables>=3.4.2
 pytest-cov

diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt
@@ -4,7 +4,6 @@ beautifulsoup4>=4.2.1
 blosc
 bottleneck>=1.2.0
 fastparquet
-feather-format
 gcsfs
 html5lib
 ipython>=5.6.0
@@ -15,7 +14,7 @@ matplotlib>=2.0.0
 nbsphinx
 numexpr>=2.6.1
 openpyxl
-pyarrow
+pyarrow>=0.4.1
 pymysql
 tables
 pytest-cov
@@ -28,4 +27,4 @@ statsmodels
 xarray
 xlrd
 xlsxwriter
-xlwt
+xlwt
diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml
@@ -7,7 +7,6 @@ dependencies:
   - bottleneck
   - cython=0.28.2
   - fastparquet
-  - feather-format
   - gcsfs
   - html5lib
   - ipython

diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml
@@ -8,7 +8,6 @@ dependencies:
   - bottleneck
   - cython>=0.28.2
   - fastparquet
-  - feather-format
   - html5lib
   - hypothesis>=3.58.0
   - ipykernel
@@ -24,6 +23,7 @@ dependencies:
   - numpy=1.13*
   - openpyxl
   - pandoc
+  - pyarrow
   - pyqt
   - pytables
   - python-dateutil

diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml
@@ -7,7 +7,6 @@ dependencies:
   - cython>=0.28.2
   - dask
   - fastparquet
-  - feather-format
   - flake8>=3.5
   - flake8-comprehensions
   - gcsfs
@@ -23,7 +22,7 @@ dependencies:
   - numpy
   - openpyxl
   - psycopg2
-  - pyarrow
+  - pyarrow=0.9.0
   - pymysql
   - pytables
   - python-snappy

diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml
@@ -9,6 +9,7 @@ dependencies:
   - numpy
   - python-dateutil
   - nomkl
+  - pyarrow
   - pytz
   - pytest
   - pytest-xdist

diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -258,7 +258,7 @@ Optional Dependencies
 * `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.18.1 or higher
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage, Version 3.4.2 or higher
-* `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
+* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1): necessary for feather-based storage.
 * `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -269,6 +269,9 @@ If installed, we now require:
 | scipy           | 0.18.1          |          |
 +-----------------+-----------------+----------+
 
+Additionally, we no longer depend on ``feather-format`` for feather-based storage
+and have replaced it with references to ``pyarrow`` (:issue:`21639` and :issue:`23053`).
+
 .. _whatsnew_0240.api_breaking.csv_line_terminator:
 
 `os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv``
@@ -954,6 +957,8 @@ Deprecations
 - The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`).
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`)
 - The class ``FrozenNDArray`` has been deprecated. When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`)
+- Deprecated the ``nthreads`` keyword of :func:`pandas.read_feather` in favor of
+  ``use_threads`` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`)
 
 .. _whatsnew_0240.deprecations.datetimelike_int_ops:
 

diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -3,38 +3,35 @@
 from distutils.version import LooseVersion
 
 from pandas.compat import range
+from pandas.util._decorators import deprecate_kwarg
 
 from pandas import DataFrame, Int64Index, RangeIndex
 
 from pandas.io.common import _stringify_path
 
 
 def _try_import():
-    # since pandas is a dependency of feather
+    # since pandas is a dependency of pyarrow
     # we need to import on first use
-
     try:
-        import feather
+        import pyarrow
+        from pyarrow import feather
     except ImportError:
-
         # give a nice error message
-        raise ImportError("the feather-format library is not installed\n"
+        raise ImportError("pyarrow is not installed\n\n"
                           "you can install via conda\n"
-                          "conda install feather-format -c conda-forge\n"
+                          "conda install pyarrow -c conda-forge\n"
                           "or via pip\n"
-                          "pip install -U feather-format\n")
+                          "pip install -U pyarrow\n")
 
-    try:
-        LooseVersion(feather.__version__) >= LooseVersion('0.3.1')
-    except AttributeError:
-        raise ImportError("the feather-format library must be >= "
-                          "version 0.3.1\n"
+    if LooseVersion(pyarrow.__version__) < LooseVersion('0.4.1'):
+        raise ImportError("pyarrow >= 0.4.1 required for feather support\n\n"
                           "you can install via conda\n"
-                          "conda install feather-format -c conda-forge"
+                          "conda install pyarrow -c conda-forge"
                           "or via pip\n"
-                          "pip install -U feather-format\n")
+                          "pip install -U pyarrow\n")
 
-    return feather
+    return feather, pyarrow
 
 
 def to_feather(df, path):
@@ -51,7 +48,7 @@ def to_feather(df, path):
     if not isinstance(df, DataFrame):
         raise ValueError("feather only support IO with DataFrames")
 
-    feather = _try_import()
+    feather = _try_import()[0]
     valid_types = {'string', 'unicode'}
 
     # validate index
@@ -83,10 +80,11 @@ def to_feather(df, path):
     if df.columns.inferred_type not in valid_types:
         raise ValueError("feather must have string column names")
 
-    feather.write_dataframe(df, path)
+    feather.write_feather(df, path)
 
 
-def read_feather(path, nthreads=1):
+@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads')
+def read_feather(path, use_threads=True):
     """
     Load a feather-format object from the file path
 
@@ -99,17 +97,25 @@ def read_feather(path, nthreads=1):
         Number of CPU threads to use when reading to pandas.DataFrame
 
        .. versionadded 0.21.0
+       .. deprecated 0.24.0
+    use_threads: bool, default True
+        Whether to parallelize reading using multiple threads
+
+       .. versionadded 0.24.0
 
     Returns
     -------
     type of object stored in file
 
     """
 
-    feather = _try_import()
+    feather, pyarrow = _try_import()
     path = _stringify_path(path)
 
-    if LooseVersion(feather.__version__) < LooseVersion('0.4.0'):
-        return feather.read_dataframe(path)
+    if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
+        int_use_threads = int(use_threads)
+        if int_use_threads < 1:
+            int_use_threads = 1
+        return feather.read_feather(path, nthreads=int_use_threads)
 
-    return feather.read_dataframe(path, nthreads=nthreads)
+    return feather.read_feather(path, use_threads=bool(use_threads))
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -135,9 +135,7 @@ def test_iterator(self):
         (pd.read_csv, 'os', FileNotFoundError, 'csv'),
         (pd.read_fwf, 'os', FileNotFoundError, 'txt'),
         (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'),
-        pytest.param(
-            pd.read_feather, 'feather', Exception, 'feather',
-            marks=pytest.mark.xfail(reason="failing for pyarrow < 0.11.0")),
+        (pd.read_feather, 'feather', Exception, 'feather'),
         (pd.read_hdf, 'tables', FileNotFoundError, 'h5'),
         (pd.read_stata, 'os', FileNotFoundError, 'dta'),
         (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'),
@@ -162,10 +160,7 @@ def test_read_non_existant_read_table(self):
         (pd.read_csv, 'os', ('io', 'data', 'iris.csv')),
         (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')),
         (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')),
-        pytest.param(
-            pd.read_feather, 'feather',
-            ('io', 'data', 'feather-0_3_1.feather'),
-            marks=pytest.mark.xfail(reason="failing for pyarrow < 0.11.0")),
+        (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')),
         (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf',
                                  'datetimetz_object.h5')),
         (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')),

diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -1,6 +1,5 @@
 """ test feather-format compat """
 from distutils.version import LooseVersion
-from warnings import catch_warnings
 
 import numpy as np
 
@@ -9,15 +8,13 @@
 from pandas.util.testing import assert_frame_equal, ensure_clean
 
 import pytest
-feather = pytest.importorskip('feather')
-from feather import FeatherError  # noqa:E402
+pyarrow = pytest.importorskip('pyarrow')
 
 from pandas.io.feather_format import to_feather, read_feather  # noqa:E402
 
-fv = LooseVersion(feather.__version__)
+pyarrow_version = LooseVersion(pyarrow.__version__)
 
 
-@pytest.mark.xfail(reason="failing for pyarrow < 0.11.0")
 @pytest.mark.single
 class TestFeather(object):
 
@@ -34,8 +31,7 @@ def check_round_trip(self, df, **kwargs):
         with ensure_clean() as path:
             to_feather(df, path)
 
-            with catch_warnings(record=True):
-                result = read_feather(path, **kwargs)
+            result = read_feather(path, **kwargs)
             assert_frame_equal(result, df)
 
     def test_error(self):
@@ -65,13 +61,6 @@ def test_basic(self):
         assert df.dttz.dtype.tz.zone == 'US/Eastern'
         self.check_round_trip(df)
 
-    @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0')
-    def test_strided_data_issues(self):
-
-        # strided data issuehttps://github.com/wesm/feather/issues/97
-        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('abc'))
-        self.check_error_on_write(df, FeatherError)
-
     def test_duplicate_columns(self):
 
         # https://github.com/wesm/feather/issues/53
@@ -85,29 +74,33 @@ def test_stringify_columns(self):
         df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy()
         self.check_error_on_write(df, ValueError)
 
-    @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0')
-    def test_unsupported(self):
-
-        # timedelta
-        df = pd.DataFrame({'a': pd.timedelta_range('1 day', periods=3)})
-        self.check_error_on_write(df, FeatherError)
-
-        # non-strings
-        df = pd.DataFrame({'a': ['a', 1, 2.0]})
-        self.check_error_on_write(df, ValueError)
-
     def test_unsupported_other(self):
 
         # period
         df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
         # Some versions raise ValueError, others raise ArrowInvalid.
         self.check_error_on_write(df, Exception)
 
-    @pytest.mark.skipif(fv < LooseVersion('0.4.0'), reason='new in 0.4.0')
     def test_rw_nthreads(self):
-
         df = pd.DataFrame({'A': np.arange(100000)})
-        self.check_round_trip(df, nthreads=2)
+        expected_warning = (
+            "the 'nthreads' keyword is deprecated, "
+            "use 'use_threads' instead"
+        )
+        with tm.assert_produces_warning(FutureWarning) as w:
+            self.check_round_trip(df, nthreads=2)
+        assert len(w) == 1
+        assert expected_warning in str(w[0])
+
+        with tm.assert_produces_warning(FutureWarning) as w:
+            self.check_round_trip(df, nthreads=1)
+        assert len(w) == 1
+        assert expected_warning in str(w[0])
+
+    def test_rw_use_threads(self):
+        df = pd.DataFrame({'A': np.arange(100000)})
+        self.check_round_trip(df, use_threads=True)
+        self.check_round_trip(df, use_threads=False)
 
     def test_write_with_index(self):
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,6 +9,7 @@ dependencies: @@
       - numpy
       - python-dateutil
       - nomkl
+      - pyarrow
       - pytz
       - pytest
       - pytest-xdist
@@ Expand Down @@