From c3dd1d66c758c70b2810febafd1f3743d79c3428 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 25 May 2023 10:46:30 -0400 Subject: [PATCH] Changes to support Numpy >= 1.24 (#13325) Closes https://github.com/rapidsai/cudf/issues/13301 Authors: - Ashwin Srinath (https://github.com/shwina) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13325 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- dependencies.yaml | 4 ++-- python/cudf/cudf/core/column/numerical.py | 10 ++++---- python/cudf/cudf/tests/test_column.py | 4 ++-- python/cudf/cudf/tests/test_csv.py | 6 +++-- python/cudf/cudf/tests/test_feather.py | 12 ++++------ python/cudf/cudf/tests/test_json.py | 12 ++++------ python/cudf/cudf/tests/test_numerical.py | 6 ++++- python/cudf/cudf/tests/test_parquet.py | 24 +++++++------------ python/cudf/cudf/tests/test_rank.py | 12 +++++----- python/cudf/cudf/tests/test_replace.py | 13 +++++++--- python/cudf/cudf/tests/test_sparse_df.py | 4 ++-- python/cudf/cudf/tests/test_unaops.py | 5 ++-- python/cudf/cudf/utils/queryutils.py | 3 ++- python/cudf/pyproject.toml | 4 ++-- python/cudf_kafka/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 18 files changed, 64 insertions(+), 63 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b6daea7c2bc..9eb0bd21e71 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -49,7 +49,7 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.21,<1.24 +- numpy>=1.21 - numpydoc - nvcc_linux-64=11.8 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 6a0faa0ebbc..47110fd2990 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -65,7 +65,7 @@ requirements: - pandas >=1.3,<1.6.0dev0 - cupy >=12.0.0 - numba >=0.57 - - numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations + - numpy >=1.21 - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - libcudf {{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index e3fcbe69932..066a38a67bb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -234,7 +234,7 @@ dependencies: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==11.0.0.* - - numpy>=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations + - numpy>=1.21 build_python: common: - output_types: [conda, requirements, pyproject] @@ -342,7 +342,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - numpy>=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations + - numpy>=1.21 - pandas>=1.3,<1.6.0dev0 run_cudf: common: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 39798320f88..c03794d5e5e 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -768,10 +768,12 @@ def _normalize_find_and_replace_input( if len(col_to_normalize) == 1: if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): return normalized_column.astype(input_column_dtype) - else: - col_to_normalize_casted = input_column_dtype.type( - col_to_normalize[0] - ) + if np.isinf(col_to_normalize[0]): + return normalized_column + col_to_normalize_casted = np.array(col_to_normalize[0]).astype( + input_column_dtype + ) + if not np.isnan(col_to_normalize_casted) and ( col_to_normalize_casted != col_to_normalize[0] ): diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index a15afa727c0..db0446d506c 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -398,8 +398,8 @@ def test_column_view_string_slice(slc): cudf.core.column.as_column([], dtype="uint8"), ), ( - cp.array([453], dtype="uint8"), - cudf.core.column.as_column([453], dtype="uint8"), + cp.array([255], dtype="uint8"), + cudf.core.column.as_column([255], dtype="uint8"), ), ], ) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 4a7804da62c..b34340295f2 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -151,7 +151,7 @@ def make_all_numeric_extremes_dataframe(): if np.issubdtype(np_type, np.integer): itype = np.iinfo(np_type) extremes = [0, +1, -1, itype.min, itype.max] - df[gdf_dtype] = np.array(extremes * 4, dtype=np_type)[:20] + df[gdf_dtype] = np.array(extremes * 4).astype(np_type)[:20] else: ftype = np.finfo(np_type) extremes = [ @@ -324,6 +324,7 @@ def test_csv_reader_dtype_dict(use_names): assert_eq(gdf, pdf) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize("use_names", [True, False]) def test_csv_reader_dtype_extremes(use_names): # Save with the column header if not explicitly specifying a list of names @@ -1433,7 +1434,7 @@ def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype): gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) - expected = np.array(values, dtype=np_dtype) + expected = np.array(values).astype(np_dtype) actual = gdf["hex_int"].to_numpy() np.testing.assert_array_equal(expected, actual) @@ -2149,6 +2150,7 @@ def test_default_integer_bitwidth_partial( ) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") def test_default_integer_bitwidth_extremes( cudf_extreme_numeric_dataframe, default_integer_bitwidth ): diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 6cdf47ed948..12a325fa4e8 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -15,23 +15,19 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): types = NUMERIC_TYPES + ["bool"] - typer = {"col_" + val: val for val in types} - ncols = len(types) nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)], - columns=pd.Index([f"col_{typ}" for typ in types], name="foo"), + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + } ) # Delete the name of the column index, and rename the row index test_pdf.columns.name = None test_pdf.index.name = "index" - # Cast all the column dtypes to objects, rename them, and then cast to - # appropriate types - test_pdf = test_pdf.astype("object").astype(typer) - # Create non-numeric categorical data otherwise may get typecasted data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 8dcab37d20a..43b0ca0119a 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -31,23 +31,19 @@ def make_numeric_dataframe(nrows, dtype): @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] - typer = {"col_" + val: val for val in types} - ncols = len(types) nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)], - columns=pd.Index([f"col_{typ}" for typ in types], name="foo"), + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + } ) # Delete the name of the column index, and rename the row index test_pdf.columns.name = None test_pdf.index.name = "test_index" - # Cast all the column dtypes to objects, rename them, and then cast to - # appropriate types - test_pdf = test_pdf.astype("object").astype(typer) - return test_pdf diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index e2fbd55c051..5bb55c164fe 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -194,6 +194,7 @@ def test_to_numeric_downcast_int(data, downcast): assert_eq(expected, got) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize( "data", [ @@ -223,6 +224,7 @@ def test_to_numeric_downcast_float(data, downcast): assert_eq(expected, got) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize( "data", [ @@ -245,6 +247,7 @@ def test_to_numeric_downcast_large_float(data, downcast): assert_eq(expected, got) +@pytest.mark.filterwarnings("ignore:overflow encountered in cast") @pytest.mark.parametrize( "data", [ @@ -325,6 +328,7 @@ def test_to_numeric_downcast_string_float(data, downcast): assert_eq(expected, got) +@pytest.mark.filterwarnings("ignore:overflow encountered in cast") @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 7aeb2afc2c7..a9cf657aaa4 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -69,14 +69,14 @@ def simple_pdf(request): "float32", "float64", ] - typer = {"col_" + val: val for val in types} - ncols = len(types) nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)], - columns=pd.Index([f"col_{typ}" for typ in types], name="foo"), + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + }, # Need to ensure that this index is not a RangeIndex to get the # expected round-tripping behavior from Parquet reader/writer. index=pd.Index(list(range(nrows))), @@ -85,10 +85,6 @@ def simple_pdf(request): test_pdf.columns.name = None test_pdf.index.name = "test_index" - # Cast all the column dtypes to objects, rename them, and then cast to - # appropriate types - test_pdf = test_pdf.astype("object").astype(typer) - return test_pdf @@ -115,14 +111,14 @@ def build_pdf(num_columns, day_resolution_timestamps): "datetime64[us]", "str", ] - typer = {"col_" + val: val for val in types} - ncols = len(types) nrows = num_columns.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)], - columns=pd.Index([f"col_{typ}" for typ in types], name="foo"), + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + }, # Need to ensure that this index is not a RangeIndex to get the # expected round-tripping behavior from Parquet reader/writer. index=pd.Index(list(range(nrows))), @@ -131,10 +127,6 @@ def build_pdf(num_columns, day_resolution_timestamps): test_pdf.columns.name = None test_pdf.index.name = "test_index" - # Cast all the column dtypes to objects, rename them, and then cast to - # appropriate types - test_pdf = test_pdf.astype(typer) - # make datetime64's a little more interesting by increasing the range of # dates note that pandas will convert these to ns timestamps, so care is # taken to avoid overflowing a ns timestamp. There is also the ability to diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 9bd67309ece..f8a8903b518 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from itertools import chain, combinations_with_replacement, product @@ -128,6 +128,7 @@ def test_rank_error_arguments(pdf): sort_dtype_args = [np.int32, np.int64, np.float32, np.float64] +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize( "elem,dtype", list( @@ -139,13 +140,12 @@ def test_rank_error_arguments(pdf): ) def test_series_rank_combinations(elem, dtype): np.random.seed(0) + aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype) gdf = DataFrame() - gdf["a"] = aa = np.fromiter(chain.from_iterable(elem), np.float64).astype( - dtype - ) - ranked_gs = gdf["a"].rank(method="first") df = pd.DataFrame() + gdf["a"] = aa df["a"] = aa + ranked_gs = gdf["a"].rank(method="first") ranked_ps = df["a"].rank(method="first") # Check - assert_eq(ranked_ps, ranked_gs.to_pandas()) + assert_eq(ranked_ps, ranked_gs) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 9e93dd6d227..13e44e7cf59 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -944,8 +944,15 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) sr = cudf.from_pandas(psr) + if sr.dtype.kind in "ui": + can_replace = np.array([replacement])[0].is_integer() and np.can_cast( + int(replacement), sr.dtype + ) + else: + can_replace = np.can_cast(replacement, sr.dtype) + # Both Scalar - if sr.dtype.type(replacement) != replacement: + if not can_replace: with pytest.raises(TypeError): sr.replace(1, replacement) else: @@ -954,7 +961,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): assert_eq(expect, got) # to_replace is a list, replacement is a scalar - if sr.dtype.type(replacement) != replacement: + if not can_replace: with pytest.raises(TypeError): sr.replace([2, 3], replacement) @@ -974,7 +981,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): # Both lists of equal length if ( np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"} - ) or (sr.dtype.type(replacement) != replacement): + ) or (not can_replace): with pytest.raises(TypeError): sr.replace([2, 3], [replacement, replacement]) else: diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 0dd47c219c0..3248e7f72c0 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import numpy as np @@ -7,7 +7,7 @@ def test_to_dense_array(): data = np.random.random(8) - mask = np.asarray([0b11010110], dtype=np.byte) + mask = np.asarray([0b11010110]).astype(np.byte) sr = Series.from_masked_array(data=data, mask=mask, null_count=3) assert sr.has_nulls diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 3f2f2072758..a7502051a78 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import itertools import operator @@ -77,9 +77,10 @@ def generate_valid_scalar_unaop_combos(): return results +@pytest.mark.filterwarnings("ignore:overflow encountered in scalar negative") @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) def test_scalar_unary_operations(slr, dtype, op): - slr_host = cudf.dtype(dtype).type(slr) + slr_host = np.array([slr])[0].astype(cudf.dtype(dtype)) slr_device = cudf.Scalar(slr, dtype=dtype) expect = op(slr_host) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 51093375eda..60b2eebdf1b 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -138,7 +138,8 @@ def query_compile(expr): key "args" is a sequence of name of the arguments. """ - funcid = f"queryexpr_{np.uintp(hash(expr)):x}" + # hash returns in the semi-open interval [-2**63, 2**63) + funcid = f"queryexpr_{(hash(expr) + 2**63):x}" # Load cache compiled = _cache.get(funcid) # Cache not found diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index b08dd92d52f..9fe0383a6fb 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cmake>=3.23.1,!=3.25.0", "cython>=0.29,<0.30", "ninja", - "numpy>=1.21,<1.24", + "numpy>=1.21", "protoc-wheel", "pyarrow==11.0.0.*", "rmm==23.6.*", @@ -32,7 +32,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numba>=0.57", - "numpy>=1.21,<1.24", + "numpy>=1.21", "nvtx>=0.2.1", "packaging", "pandas>=1.3,<1.6.0dev0", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index df0825c846a..d8b97f52864 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "cython>=0.29,<0.30", - "numpy>=1.21,<1.24", + "numpy>=1.21", "pyarrow==11.0.0.*", "setuptools", "wheel", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 42b6c26c002..ca97d7ca2fa 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "dask==2023.3.2", "distributed==2023.3.2.1", "fsspec>=0.6.0", - "numpy>=1.21,<1.24", + "numpy>=1.21", "pandas>=1.3,<1.6.0dev0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [