From 956a87f4d871e4c9c8061e462f0d0efc4b140461 Mon Sep 17 00:00:00 2001 From: winlu Date: Thu, 1 Nov 2018 13:02:40 +0100 Subject: [PATCH] BUG: Deprecate nthreads argument (#23112) The nthreads argument is no longer supported since pyarrow 0.11.0 and was replaced with use_threads. Hence we deprecate the argument now as well so we can remove it in the future. This commit also: - removes feather-format as a dependency and replaces it with usage of pyarrow directly. - sets CI dependencies to respect the changes above. We test backwards compatibility with pyarrow 0.9.0 as conda does not provide a pyarrow 0.10.0 and the conda-forge version has compatibility issues with the rest of the installed packages. Resolves #23053. Resolves #21639. --- ci/azure-windows-36.yaml | 1 - ci/requirements-optional-conda.txt | 3 +- ci/requirements-optional-pip.txt | 5 ++- ci/travis-27.yaml | 1 - ci/travis-36-doc.yaml | 2 +- ci/travis-36.yaml | 3 +- ci/travis-37.yaml | 1 + doc/source/install.rst | 2 +- doc/source/whatsnew/v0.24.0.txt | 5 +++ pandas/io/feather_format.py | 50 +++++++++++++++++------------- pandas/tests/io/test_common.py | 9 ++---- pandas/tests/io/test_feather.py | 49 +++++++++++++---------------- 12 files changed, 63 insertions(+), 68 deletions(-) diff --git a/ci/azure-windows-36.yaml b/ci/azure-windows-36.yaml index 979443661f99bf..af42545af79714 100644 --- a/ci/azure-windows-36.yaml +++ b/ci/azure-windows-36.yaml @@ -7,7 +7,6 @@ dependencies: - bottleneck - boost-cpp<1.67 - fastparquet - - feather-format - matplotlib - numexpr - numpy=1.14* diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index 04abfede671632..c9dc385b879863 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -2,7 +2,6 @@ beautifulsoup4>=4.2.1 blosc bottleneck>=1.2.0 fastparquet -feather-format gcsfs html5lib ipython>=5.6.0 @@ -13,7 +12,7 @@ matplotlib>=2.0.0 nbsphinx numexpr>=2.6.1 openpyxl -pyarrow +pyarrow>=0.4.1 pymysql 
pytables>=3.4.2 pytest-cov diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 0153bdb6edf04c..347ea0d9832b04 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -4,7 +4,6 @@ beautifulsoup4>=4.2.1 blosc bottleneck>=1.2.0 fastparquet -feather-format gcsfs html5lib ipython>=5.6.0 @@ -15,7 +14,7 @@ matplotlib>=2.0.0 nbsphinx numexpr>=2.6.1 openpyxl -pyarrow +pyarrow>=0.4.1 pymysql tables pytest-cov @@ -28,4 +27,4 @@ statsmodels xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index 8955bea1fc0106..9641a76152d7b0 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -7,7 +7,6 @@ dependencies: - bottleneck - cython=0.28.2 - fastparquet - - feather-format - gcsfs - html5lib - ipython diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml index f1f64546374af2..ce095b887f1894 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/travis-36-doc.yaml @@ -8,7 +8,6 @@ dependencies: - bottleneck - cython>=0.28.2 - fastparquet - - feather-format - html5lib - hypothesis>=3.58.0 - ipykernel @@ -24,6 +23,7 @@ dependencies: - numpy=1.13* - openpyxl - pandoc + - pyarrow - pyqt - pytables - python-dateutil diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index 257f830ec6c481..352717a8422145 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -7,7 +7,6 @@ dependencies: - cython>=0.28.2 - dask - fastparquet - - feather-format - flake8>=3.5 - flake8-comprehensions - gcsfs @@ -23,7 +22,7 @@ dependencies: - numpy - openpyxl - psycopg2 - - pyarrow + - pyarrow=0.9.0 - pymysql - pytables - python-snappy diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml index 4f2138d8555e34..7dbd85ac27df69 100644 --- a/ci/travis-37.yaml +++ b/ci/travis-37.yaml @@ -9,6 +9,7 @@ dependencies: - numpy - python-dateutil - nomkl + - pyarrow - pytz - pytest - pytest-xdist diff --git a/doc/source/install.rst b/doc/source/install.rst index 843384b680cf8f..b32c5b1145e85e 100644 
--- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -258,7 +258,7 @@ Optional Dependencies * `SciPy `__: miscellaneous statistical functions, Version 0.18.1 or higher * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. * `PyTables `__: necessary for HDF5-based storage, Version 3.4.2 or higher -* `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher. +* `pyarrow `__ (>= 0.4.1): necessary for feather-based storage. * `Apache Parquet `__, either `pyarrow `__ (>= 0.4.1) or `fastparquet `__ (>= 0.0.6) for parquet-based storage. The `snappy `__ and `brotli `__ are available for compression support. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9f1d3574257faa..9f69bdc9c040c0 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -269,6 +269,9 @@ If installed, we now require: | scipy | 0.18.1 | | +-----------------+-----------------+----------+ +Additionally we no longer depend on `feather-format` for feather based storage +and replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`). + .. _whatsnew_0240.api_breaking.csv_line_terminator: `os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv`` @@ -955,6 +958,8 @@ Deprecations - The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have deprecated the ``errors`` argument in favor of the ``nonexistent`` argument (:issue:`8917`) - The class ``FrozenNDArray`` has been deprecated. 
When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) +- Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of + `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) .. _whatsnew_0240.deprecations.datetimelike_int_ops: diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 8d2715fe5beedd..ea2d96cd896d9b 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -3,6 +3,7 @@ from distutils.version import LooseVersion from pandas.compat import range +from pandas.util._decorators import deprecate_kwarg from pandas import DataFrame, Int64Index, RangeIndex @@ -10,31 +11,27 @@ def _try_import(): - # since pandas is a dependency of feather + # since pandas is a dependency of pyarrow # we need to import on first use - try: - import feather + import pyarrow + from pyarrow import feather except ImportError: - # give a nice error message - raise ImportError("the feather-format library is not installed\n" + raise ImportError("pyarrow is not installed\n\n" "you can install via conda\n" - "conda install feather-format -c conda-forge\n" + "conda install pyarrow -c conda-forge\n" "or via pip\n" - "pip install -U feather-format\n") + "pip install -U pyarrow\n") - try: - LooseVersion(feather.__version__) >= LooseVersion('0.3.1') - except AttributeError: - raise ImportError("the feather-format library must be >= " - "version 0.3.1\n" + if LooseVersion(pyarrow.__version__) < LooseVersion('0.4.1'): + raise ImportError("pyarrow >= 0.4.1 required for feather support\n\n" "you can install via conda\n" - "conda install feather-format -c conda-forge" + "conda install pyarrow -c conda-forge" "or via pip\n" - "pip install -U feather-format\n") + "pip install -U pyarrow\n") - return feather + return feather, pyarrow def to_feather(df, path): @@ -51,7 +48,7 @@ def to_feather(df, path): if not isinstance(df, DataFrame): raise ValueError("feather only 
support IO with DataFrames") - feather = _try_import() + feather = _try_import()[0] valid_types = {'string', 'unicode'} # validate index @@ -83,10 +80,11 @@ def to_feather(df, path): if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_dataframe(df, path) + feather.write_feather(df, path) -def read_feather(path, nthreads=1): +@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads') +def read_feather(path, use_threads=True): """ Load a feather-format object from the file path @@ -99,6 +97,11 @@ def read_feather(path, nthreads=1): Number of CPU threads to use when reading to pandas.DataFrame .. versionadded 0.21.0 + .. deprecated 0.24.0 + use_threads: bool, default True + Whether to parallelize reading using multiple threads + + .. versionadded 0.24.0 Returns ------- @@ -106,10 +109,13 @@ def read_feather(path, nthreads=1): """ - feather = _try_import() + feather, pyarrow = _try_import() path = _stringify_path(path) - if LooseVersion(feather.__version__) < LooseVersion('0.4.0'): - return feather.read_dataframe(path) + if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'): + int_use_threads = int(use_threads) + if int_use_threads < 1: + int_use_threads = 1 + return feather.read_feather(path, nthreads=int_use_threads) - return feather.read_dataframe(path, nthreads=nthreads) + return feather.read_feather(path, use_threads=bool(use_threads)) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 88a2fded3500c5..73e29e6eb9a6a1 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -135,9 +135,7 @@ def test_iterator(self): (pd.read_csv, 'os', FileNotFoundError, 'csv'), (pd.read_fwf, 'os', FileNotFoundError, 'txt'), (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'), - pytest.param( - pd.read_feather, 'feather', Exception, 'feather', - marks=pytest.mark.xfail(reason="failing for pyarrow < 0.11.0")), + (pd.read_feather, 'feather', 
Exception, 'feather'), (pd.read_hdf, 'tables', FileNotFoundError, 'h5'), (pd.read_stata, 'os', FileNotFoundError, 'dta'), (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'), @@ -162,10 +160,7 @@ def test_read_non_existant_read_table(self): (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), - pytest.param( - pd.read_feather, 'feather', - ('io', 'data', 'feather-0_3_1.feather'), - marks=pytest.mark.xfail(reason="failing for pyarrow < 0.11.0")), + (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', 'datetimetz_object.h5')), (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 82f9f7253e65ca..16b59526c8233b 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -1,6 +1,5 @@ """ test feather-format compat """ from distutils.version import LooseVersion -from warnings import catch_warnings import numpy as np @@ -9,15 +8,13 @@ from pandas.util.testing import assert_frame_equal, ensure_clean import pytest -feather = pytest.importorskip('feather') -from feather import FeatherError # noqa:E402 +pyarrow = pytest.importorskip('pyarrow') from pandas.io.feather_format import to_feather, read_feather # noqa:E402 -fv = LooseVersion(feather.__version__) +pyarrow_version = LooseVersion(pyarrow.__version__) -@pytest.mark.xfail(reason="failing for pyarrow < 0.11.0") @pytest.mark.single class TestFeather(object): @@ -34,8 +31,7 @@ def check_round_trip(self, df, **kwargs): with ensure_clean() as path: to_feather(df, path) - with catch_warnings(record=True): - result = read_feather(path, **kwargs) + result = read_feather(path, **kwargs) assert_frame_equal(result, df) def test_error(self): @@ -65,13 +61,6 @@ def test_basic(self): assert df.dttz.dtype.tz.zone == 'US/Eastern' 
self.check_round_trip(df) - @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0') - def test_strided_data_issues(self): - - # strided data issuehttps://github.com/wesm/feather/issues/97 - df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('abc')) - self.check_error_on_write(df, FeatherError) - def test_duplicate_columns(self): # https://github.com/wesm/feather/issues/53 @@ -85,17 +74,6 @@ def test_stringify_columns(self): df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() self.check_error_on_write(df, ValueError) - @pytest.mark.skipif(fv >= LooseVersion('0.4.0'), reason='fixed in 0.4.0') - def test_unsupported(self): - - # timedelta - df = pd.DataFrame({'a': pd.timedelta_range('1 day', periods=3)}) - self.check_error_on_write(df, FeatherError) - - # non-strings - df = pd.DataFrame({'a': ['a', 1, 2.0]}) - self.check_error_on_write(df, ValueError) - def test_unsupported_other(self): # period @@ -103,11 +81,26 @@ def test_unsupported_other(self): # Some versions raise ValueError, others raise ArrowInvalid. self.check_error_on_write(df, Exception) - @pytest.mark.skipif(fv < LooseVersion('0.4.0'), reason='new in 0.4.0') def test_rw_nthreads(self): - df = pd.DataFrame({'A': np.arange(100000)}) - self.check_round_trip(df, nthreads=2) + expected_warning = ( + "the 'nthreads' keyword is deprecated, " + "use 'use_threads' instead" + ) + with tm.assert_produces_warning(FutureWarning) as w: + self.check_round_trip(df, nthreads=2) + assert len(w) == 1 + assert expected_warning in str(w[0]) + + with tm.assert_produces_warning(FutureWarning) as w: + self.check_round_trip(df, nthreads=1) + assert len(w) == 1 + assert expected_warning in str(w[0]) + + def test_rw_use_threads(self): + df = pd.DataFrame({'A': np.arange(100000)}) + self.check_round_trip(df, use_threads=True) + self.check_round_trip(df, use_threads=False) def test_write_with_index(self):