From 47be09fd744f64cae23d90bc53dfdcffd2eb2aa6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 30 Jan 2022 22:20:11 -0600 Subject: [PATCH 01/10] Fix warnings for ceil/floor. --- python/cudf/cudf/core/frame.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 891f58657b0..bc666430189 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3199,9 +3199,9 @@ def ceil(self): """ warnings.warn( - "Series.ceil and DataFrame.ceil are deprecated and will be \ - removed in the future", - DeprecationWarning, + "Series.ceil and DataFrame.ceil are deprecated and will be " + "removed in the future", + FutureWarning, ) return self._unaryop("ceil") @@ -3238,9 +3238,9 @@ def floor(self): """ warnings.warn( - "Series.ceil and DataFrame.ceil are deprecated and will be \ - removed in the future", - DeprecationWarning, + "Series.floor and DataFrame.floor are deprecated and will be " + "removed in the future.", + FutureWarning, ) return self._unaryop("floor") From 51d4b744e626a282b852e1ede34111960256d415 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 30 Jan 2022 22:25:08 -0600 Subject: [PATCH 02/10] Fix warnings in test_unaops.py. --- python/cudf/cudf/tests/test_unaops.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 22c78b5f933..e79b74e3aab 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -56,13 +56,21 @@ def test_series_neg(): def test_series_ceil(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal(sr.ceil().to_numpy(), np.ceil(arr)) + with pytest.warns( + FutureWarning, match="Series.ceil and DataFrame.ceil are deprecated" + ): + sr = sr.ceil() + np.testing.assert_equal(sr.to_numpy(), np.ceil(arr)) def test_series_floor(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal(sr.floor().to_numpy(), np.floor(arr)) + with pytest.warns( + FutureWarning, match="Series.floor and DataFrame.floor are deprecated" + ): + sr = sr.floor() + np.testing.assert_equal(sr.to_numpy(), np.floor(arr)) @pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128]) @@ -74,7 +82,10 @@ def test_validity_ceil(nelem): sr = Series.from_masked_array(data, mask) # Result - res = sr.ceil() + with pytest.warns( + FutureWarning, match="Series.ceil and DataFrame.ceil are deprecated" + ): + res = sr.ceil() na_value = -100000 got = res.fillna(na_value).to_numpy() From b6428b27c50d2b034740d7eadfbd584ab2c6d537 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 30 Jan 2022 23:29:29 -0600 Subject: [PATCH 03/10] Silence warning from pyarrow 5.0.0. Appears to be fixed in pyarrow 6.0.1 (#9686). --- python/cudf/cudf/core/column/column.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5d694dac255..2866c1c003c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2090,16 +2090,24 @@ def as_column( dtype = "bool" np_type = np.dtype(dtype).type pa_type = np_to_pa_dtype(np.dtype(dtype)) - data = as_column( - pa.array( + # TODO: A warning is emitted from pyarrow 5.0.0's function + # pyarrow.lib._sequence_to_array: + # "DeprecationWarning: an integer is required (got type float). + # Implicit conversion to integers using __int__ is deprecated, + # and may be removed in a future version of Python." + # This warning does not appear in pyarrow 6.0.1 and will be + # resolved by https://github.com/rapidsai/cudf/pull/9686/. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + pa_array = pa.array( arbitrary, type=pa_type, from_pandas=True if nan_as_null is None else nan_as_null, - ), - dtype=dtype, - nan_as_null=nan_as_null, + ) + data = as_column( + pa_array, dtype=dtype, nan_as_null=nan_as_null, ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): From dbfa6510db8a3ffccd48240aec1c939e3e533a31 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 31 Jan 2022 09:18:58 -0600 Subject: [PATCH 04/10] Generate random values appropriate for the given dtype. --- python/cudf/cudf/tests/test_column.py | 31 +++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 748cf958ac3..915901955d0 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -29,8 +29,35 @@ @pytest.fixture(params=dtypes, ids=dtypes) def pandas_input(request): - data = np.random.randint(0, 1000, 100) - return pd.Series(data, dtype=request.param) + rng = np.random.default_rng() + dtype = request.param + size = 100 + + def random_ints(dtype, size): + dtype_min = np.iinfo(dtype).min + dtype_max = np.iinfo(dtype).max + return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype) + + try: + dtype = np.dtype(request.param) + except TypeError: + if dtype == "category": + data = random_ints(np.int64, size) + else: + if dtype.kind == "b": + data = np.random.choice([False, True], size=size) + elif dtype.kind in ("m", "M"): + # datetime or timedelta + data = random_ints(np.int64, size).astype(dtype.str) + elif dtype.kind == "U": + # Unicode strings of integers like "12345" + data = random_ints(np.int64, size).astype(dtype.str) + elif dtype.kind == "f": + # floats in [0.0, 1.0) + data = rng.random(size=size, dtype=dtype) + else: + data = random_ints(dtype, size) + return pd.Series(data, dtype=dtype) def str_host_view(list_of_str, to_dtype): From 47669c26d7e289292f575d4106358338a450cec1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 31 Jan 2022 09:24:51 -0600 Subject: [PATCH 05/10] Avoid creating GPU objects in parametrize. --- python/cudf/cudf/tests/test_column.py | 29 ++++++++++----------------- 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 915901955d0..5ed8a0415cf 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -406,29 +406,22 @@ def test_as_column_buffer(data, expected): @pytest.mark.parametrize( - "data,expected", + "data,pyarrow_kwargs,cudf_kwargs", [ ( - pa.array([100, 200, 300], type=pa.decimal128(3)), - cudf.core.column.as_column( - [100, 200, 300], dtype=cudf.core.dtypes.Decimal128Dtype(3, 0) - ), - ), - ( - pa.array([{"a": 1, "b": 3}, {"c": 2, "d": 4}]), - cudf.core.column.as_column([{"a": 1, "b": 3}, {"c": 2, "d": 4}]), - ), - ( - pa.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), - cudf.core.column.as_column( - [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]] - ), + [100, 200, 300], + dict(type=pa.decimal128(3)), + dict(dtype=cudf.core.dtypes.Decimal128Dtype(3, 0)), ), + ([{"a": 1, "b": 3}, {"c": 2, "d": 4}], dict(), dict(),), + ([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dict(), dict(),), ], ) -def test_as_column_arrow_array(data, expected): - actual_column = cudf.core.column.as_column(data) - assert_eq(cudf.Series(actual_column), cudf.Series(expected)) +def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs): + pyarrow_data = pa.array(data, **pyarrow_kwargs) + cudf_from_pyarrow = cudf.core.column.as_column(pyarrow_data) + expected = cudf.core.column.as_column(data, **cudf_kwargs) + assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected)) @pytest.mark.parametrize( From d2c0c4bc4bcd67c1d61d8f1a7291bc83d59f8ac6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 31 Jan 2022 17:40:49 -0600 Subject: [PATCH 06/10] Update python/cudf/cudf/tests/test_column.py --- python/cudf/cudf/tests/test_column.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 5ed8a0415cf..dc7fa4540d2 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -43,6 +43,8 @@ def random_ints(dtype, size): except TypeError: if dtype == "category": data = random_ints(np.int64, size) + else: + raise else: if dtype.kind == "b": data = np.random.choice([False, True], size=size) From 99006cecf02b6040c78c3750d0de5b45145ad870 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 2 Feb 2022 23:33:02 -0600 Subject: [PATCH 07/10] Silence warning from pandas.testing.assert_series_equal. --- python/cudf/cudf/testing/_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 41dac26edf8..6c602d321eb 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import re +import warnings from collections.abc import Mapping, Sequence from contextlib import contextmanager from decimal import Decimal @@ -109,7 +110,15 @@ def assert_eq(left, right, **kwargs): if isinstance(left, pd.DataFrame): tm.assert_frame_equal(left, right, **kwargs) elif isinstance(left, pd.Series): - tm.assert_series_equal(left, right, **kwargs) + # TODO: A warning is emitted from the function + # pandas.testing.assert_series_equal for some inputs: + # "DeprecationWarning: elementwise comparison failed; this will raise + # an error in the future." + # This warning comes from a call from pandas to numpy. It is ignored + # here because it cannot be fixed within cudf. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + tm.assert_series_equal(left, right, **kwargs) elif isinstance(left, pd.Index): tm.assert_index_equal(left, right, **kwargs) elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): From d8a0f1bcea1c7cc821c9814aba7002ede9b69e29 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 3 Feb 2022 19:31:31 -0600 Subject: [PATCH 08/10] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/tests/test_column.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index dc7fa4540d2..6aa356c6aa9 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -43,14 +43,13 @@ def random_ints(dtype, size): except TypeError: if dtype == "category": data = random_ints(np.int64, size) - else: - raise + raise else: if dtype.kind == "b": - data = np.random.choice([False, True], size=size) + data = rng.choice([False, True], size=size) elif dtype.kind in ("m", "M"): # datetime or timedelta - data = random_ints(np.int64, size).astype(dtype.str) + data = random_ints(np.int64, size) elif dtype.kind == "U": # Unicode strings of integers like "12345" data = random_ints(np.int64, size).astype(dtype.str) @@ -412,17 +411,17 @@ def test_as_column_buffer(data, expected): [ ( [100, 200, 300], - dict(type=pa.decimal128(3)), - dict(dtype=cudf.core.dtypes.Decimal128Dtype(3, 0)), + {"type": pa.decimal128(3)}, + {"dtype": cudf.core.dtypes.Decimal128Dtype(3, 0)}), ), - ([{"a": 1, "b": 3}, {"c": 2, "d": 4}], dict(), dict(),), - ([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dict(), dict(),), + ([{"a": 1, "b": 3}, {"c": 2, "d": 4}], {}, {},), + ([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], {}, {},), ], ) def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs): pyarrow_data = pa.array(data, **pyarrow_kwargs) - cudf_from_pyarrow = cudf.core.column.as_column(pyarrow_data) - expected = cudf.core.column.as_column(data, **cudf_kwargs) + cudf_from_pyarrow = as_column(pyarrow_data) + expected = as_column(data, **cudf_kwargs) assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected)) From e6a92dc5952cdae7e3707d82edd7e9e80cfa5337 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 3 Feb 2022 19:46:29 -0600 Subject: [PATCH 09/10] Fix parentheses. --- python/cudf/cudf/tests/test_column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 6aa356c6aa9..c039bf0a1ca 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -412,7 +412,7 @@ def test_as_column_buffer(data, expected): ( [100, 200, 300], {"type": pa.decimal128(3)}, - {"dtype": cudf.core.dtypes.Decimal128Dtype(3, 0)}), + {"dtype": cudf.core.dtypes.Decimal128Dtype(3, 0)}, ), ([{"a": 1, "b": 3}, {"c": 2, "d": 4}], {}, {},), ([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], {}, {},), From a2fbb646feed6470d11df07b7b9c8033ed3ad8bb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 4 Feb 2022 08:24:16 -0600 Subject: [PATCH 10/10] Fix test function. --- python/cudf/cudf/tests/test_column.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index c039bf0a1ca..365b351061d 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -29,8 +29,8 @@ @pytest.fixture(params=dtypes, ids=dtypes) def pandas_input(request): - rng = np.random.default_rng() dtype = request.param + rng = np.random.default_rng() size = 100 def random_ints(dtype, size): @@ -39,11 +39,12 @@ def random_ints(dtype, size): return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype) try: - dtype = np.dtype(request.param) + dtype = np.dtype(dtype) except TypeError: if dtype == "category": data = random_ints(np.int64, size) - raise + else: + raise else: if dtype.kind == "b": data = rng.choice([False, True], size=size)