Changes to support Numpy >= 1.24 (#13325)
Closes #13301

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #13325
shwina authored May 25, 2023
1 parent ae375d2 commit c3dd1d6
Showing 18 changed files with 64 additions and 63 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -49,7 +49,7 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
-- numpy>=1.21,<1.24
+- numpy>=1.21
 - numpydoc
 - nvcc_linux-64=11.8
 - nvtx>=0.2.1
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
@@ -65,7 +65,7 @@ requirements:
     - pandas >=1.3,<1.6.0dev0
     - cupy >=12.0.0
     - numba >=0.57
-    - numpy >=1.21,<1.24  # Temporarily upper bound numpy to avoid overflow deprecations
+    - numpy >=1.21
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
     - libcudf {{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
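For context, a minimal sketch (not part of the diff) of the NumPy 1.24 behavior change that the removed "<1.24" upper bound was guarding against: implicit conversion of out-of-bound Python integers is deprecated on 1.24 (and an error on NumPy 2.x), while an explicit astype() cast still wraps.

import warnings

import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter("error")
    try:
        np.array([453], dtype="uint8")  # out-of-bound conversion: rejected on NumPy >= 1.24
    except (DeprecationWarning, OverflowError) as exc:
        print("rejected:", exc)

print(np.array([453]).astype("uint8"))  # [197] -- explicit casts still wrap modulo 256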
4 changes: 2 additions & 2 deletions dependencies.yaml
@@ -234,7 +234,7 @@ dependencies:
       # Hard pin the patch version used during the build. This must be kept
       # in sync with the version pinned in get_arrow.cmake.
       - pyarrow==11.0.0.*
-      - numpy>=1.21,<1.24  # Temporarily upper bound numpy to avoid overflow deprecations
+      - numpy>=1.21
   build_python:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -342,7 +342,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - fsspec>=0.6.0
-          - numpy>=1.21,<1.24  # Temporarily upper bound numpy to avoid overflow deprecations
+          - numpy>=1.21
           - pandas>=1.3,<1.6.0dev0
   run_cudf:
     common:
10 changes: 6 additions & 4 deletions python/cudf/cudf/core/column/numerical.py
@@ -768,10 +768,12 @@ def _normalize_find_and_replace_input(
     if len(col_to_normalize) == 1:
         if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]):
             return normalized_column.astype(input_column_dtype)
-        else:
-            col_to_normalize_casted = input_column_dtype.type(
-                col_to_normalize[0]
-            )
+        if np.isinf(col_to_normalize[0]):
+            return normalized_column
+        col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
+            input_column_dtype
+        )
+
         if not np.isnan(col_to_normalize_casted) and (
             col_to_normalize_casted != col_to_normalize[0]
         ):
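A hedged illustration of the pattern adopted above: the dtype scalar constructor rejects out-of-range values on NumPy >= 1.24, whereas np.array(value).astype(dtype) is an explicit cast that wraps. The np.isinf early return presumably exists because an infinite needle can never match a finite column, and casting it would itself warn.

import numpy as np

# np.int8(300)  # out-of-bound conversion: DeprecationWarning on 1.24, error on 2.x
print(np.array(300).astype(np.int8))  # 44 -- the explicit cast wraps instead
print(np.isinf(float("inf")))         # True -- guarded before any cast is attempted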
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_column.py
@@ -398,8 +398,8 @@ def test_column_view_string_slice(slc):
         cudf.core.column.as_column([], dtype="uint8"),
     ),
     (
-        cp.array([453], dtype="uint8"),
-        cudf.core.column.as_column([453], dtype="uint8"),
+        cp.array([255], dtype="uint8"),
+        cudf.core.column.as_column([255], dtype="uint8"),
     ),
 ],
 )
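The old test value 453 does not fit in uint8, so building the array would now trip the out-of-bound deprecation; 255 is the largest value that does fit.

import numpy as np

print(np.iinfo(np.uint8).max)  # 255 -- the new, in-range test value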
6 changes: 4 additions & 2 deletions python/cudf/cudf/tests/test_csv.py
@@ -151,7 +151,7 @@ def make_all_numeric_extremes_dataframe():
         if np.issubdtype(np_type, np.integer):
             itype = np.iinfo(np_type)
             extremes = [0, +1, -1, itype.min, itype.max]
-            df[gdf_dtype] = np.array(extremes * 4, dtype=np_type)[:20]
+            df[gdf_dtype] = np.array(extremes * 4).astype(np_type)[:20]
         else:
             ftype = np.finfo(np_type)
             extremes = [
@@ -324,6 +324,7 @@ def test_csv_reader_dtype_dict(use_names):
     assert_eq(gdf, pdf)


+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
 @pytest.mark.parametrize("use_names", [True, False])
 def test_csv_reader_dtype_extremes(use_names):
     # Save with the column header if not explicitly specifying a list of names
@@ -1433,7 +1434,7 @@ def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype):

     gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])

-    expected = np.array(values, dtype=np_dtype)
+    expected = np.array(values).astype(np_dtype)
     actual = gdf["hex_int"].to_numpy()
     np.testing.assert_array_equal(expected, actual)
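The hexadecimal-overflow test intentionally feeds values such as 0xFFFFFFFF to signed dtypes, so the expected array must now be built with an explicit cast; a small sketch of the difference:

import numpy as np

# np.array([0xFFFFFFFF], dtype=np.int32)  # out-of-bound conversion: deprecated on 1.24
print(np.array([0xFFFFFFFF]).astype(np.int32))  # [-1] -- wraps to the two's-complement value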

@@ -2149,6 +2150,7 @@ def test_default_integer_bitwidth_partial(
     )


+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
 def test_default_integer_bitwidth_extremes(
     cudf_extreme_numeric_dataframe, default_integer_bitwidth
 ):
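The new filterwarnings markers silence the RuntimeWarning that NumPy >= 1.24 emits when non-finite floats are cast to integers; a minimal reproduction, assuming 1.24 semantics:

import warnings

import numpy as np

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    np.array([np.nan, np.inf]).astype(np.int64)

print(caught[0].message)  # invalid value encountered in cast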
12 changes: 4 additions & 8 deletions python/cudf/cudf/tests/test_feather.py
@@ -15,23 +15,19 @@
 @pytest.fixture(params=[0, 1, 10, 100])
 def pdf(request):
     types = NUMERIC_TYPES + ["bool"]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = request.param

     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
+        {
+            f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
+            for typ in types
+        }
     )
     # Delete the name of the column index, and rename the row index
     test_pdf.columns.name = None
     test_pdf.index.name = "index"

-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype("object").astype(typer)
-
     # Create non-numeric categorical data otherwise may get typecasted
     data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)]
     test_pdf["col_category"] = pd.Series(data, dtype="category")
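The same fixture rewrite recurs in test_json.py and test_parquet.py below. The old pattern generated values up to ncols * nrows and cast them through object, which overflows the narrow integer dtypes; a hedged sketch of the failure mode, assuming NumPy >= 1.24 conversion rules for object arrays:

import numpy as np
import pandas as pd

old = pd.Series(range(1490, 1505), dtype="object")
# old.astype("int8")  # 1500 is out of bounds for int8: deprecated on NumPy >= 1.24

# New pattern: values are drawn in [0, nrows) at the target dtype, in range by construction.
new = pd.Series(np.random.randint(0, 15, 15).astype("int8"))
print(new.dtype)  # int8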
12 changes: 4 additions & 8 deletions python/cudf/cudf/tests/test_json.py
@@ -31,23 +31,19 @@ def make_numeric_dataframe(nrows, dtype):
 @pytest.fixture(params=[0, 1, 10, 100])
 def pdf(request):
     types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = request.param

     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
+        {
+            f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
+            for typ in types
+        }
     )
     # Delete the name of the column index, and rename the row index
     test_pdf.columns.name = None
     test_pdf.index.name = "test_index"

-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype("object").astype(typer)
-
     return test_pdf
6 changes: 5 additions & 1 deletion python/cudf/cudf/tests/test_numerical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.

 import numpy as np
 import pandas as pd
@@ -194,6 +194,7 @@ def test_to_numeric_downcast_int(data, downcast):
     assert_eq(expected, got)


+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [
@@ -223,6 +224,7 @@ def test_to_numeric_downcast_float(data, downcast):
     assert_eq(expected, got)


+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [
@@ -245,6 +247,7 @@ def test_to_numeric_downcast_large_float(data, downcast):
     assert_eq(expected, got)


+@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [
@@ -325,6 +328,7 @@ def test_to_numeric_downcast_string_float(data, downcast):
     assert_eq(expected, got)


+@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [
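The "overflow encountered in cast" filters cover the sibling warning for finite values that exceed the target dtype's range; a minimal reproduction, assuming 1.24 semantics:

import warnings

import numpy as np

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    np.array([1e300]).astype(np.float32)  # far beyond float32's ~3.4e38 maximum

print(caught[0].message)  # overflow encountered in cast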
24 changes: 8 additions & 16 deletions python/cudf/cudf/tests/test_parquet.py
@@ -69,14 +69,14 @@ def simple_pdf(request):
         "float32",
         "float64",
     ]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = request.param

     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
+        {
+            f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
+            for typ in types
+        },
         # Need to ensure that this index is not a RangeIndex to get the
         # expected round-tripping behavior from Parquet reader/writer.
         index=pd.Index(list(range(nrows))),
@@ -85,10 +85,6 @@ def simple_pdf(request):
     test_pdf.columns.name = None
     test_pdf.index.name = "test_index"

-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype("object").astype(typer)
-
     return test_pdf
@@ -115,14 +111,14 @@ def build_pdf(num_columns, day_resolution_timestamps):
         "datetime64[us]",
         "str",
     ]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = num_columns.param

     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
+        {
+            f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
+            for typ in types
+        },
         # Need to ensure that this index is not a RangeIndex to get the
         # expected round-tripping behavior from Parquet reader/writer.
         index=pd.Index(list(range(nrows))),
@@ -131,10 +127,6 @@ def build_pdf(num_columns, day_resolution_timestamps):
     test_pdf.columns.name = None
     test_pdf.index.name = "test_index"

-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype(typer)
-
     # make datetime64's a little more interesting by increasing the range of
     # dates note that pandas will convert these to ns timestamps, so care is
     # taken to avoid overflowing a ns timestamp. There is also the ability to
12 changes: 6 additions & 6 deletions python/cudf/cudf/tests/test_rank.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 from itertools import chain, combinations_with_replacement, product
@@ -128,6 +128,7 @@ def test_rank_error_arguments(pdf):
 sort_dtype_args = [np.int32, np.int64, np.float32, np.float64]


+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
 @pytest.mark.parametrize(
     "elem,dtype",
     list(
@@ -139,13 +140,12 @@ def test_rank_error_arguments(pdf):
 )
 def test_series_rank_combinations(elem, dtype):
     np.random.seed(0)
+    aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype)
     gdf = DataFrame()
-    gdf["a"] = aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(
-        dtype
-    )
-    ranked_gs = gdf["a"].rank(method="first")
     df = pd.DataFrame()
+    gdf["a"] = aa
     df["a"] = aa
+    ranked_gs = gdf["a"].rank(method="first")
     ranked_ps = df["a"].rank(method="first")
     # Check
-    assert_eq(ranked_ps, ranked_gs.to_pandas())
+    assert_eq(ranked_ps, ranked_gs)
13 changes: 10 additions & 3 deletions python/cudf/cudf/tests/test_replace.py
@@ -944,8 +944,15 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
     psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
     sr = cudf.from_pandas(psr)

+    if sr.dtype.kind in "ui":
+        can_replace = np.array([replacement])[0].is_integer() and np.can_cast(
+            int(replacement), sr.dtype
+        )
+    else:
+        can_replace = np.can_cast(replacement, sr.dtype)
+
     # Both Scalar
-    if sr.dtype.type(replacement) != replacement:
+    if not can_replace:
         with pytest.raises(TypeError):
             sr.replace(1, replacement)
     else:
@@ -954,7 +961,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
         assert_eq(expect, got)

     # to_replace is a list, replacement is a scalar
-    if sr.dtype.type(replacement) != replacement:
+    if not can_replace:
         with pytest.raises(TypeError):

             sr.replace([2, 3], replacement)
@@ -974,7 +981,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
     # Both lists of equal length
     if (
         np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"}
-    ) or (sr.dtype.type(replacement) != replacement):
+    ) or (not can_replace):
         with pytest.raises(TypeError):
             sr.replace([2, 3], [replacement, replacement])
     else:
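A hedged sketch of the new can_replace predicate above, using the NumPy 1.24-era value-based rules (np.can_cast stopped accepting Python scalars in NumPy 2.0):

import numpy as np

print(np.can_cast(127, np.int8))        # True  -- the value fits
print(np.can_cast(128, np.int8))        # False -- out of range for int8
print(np.array([2.5])[0].is_integer())  # False -- fractional floats cannot replace into integer columns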
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_sparse_df.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.

 import numpy as np
@@ -7,7 +7,7 @@

 def test_to_dense_array():
     data = np.random.random(8)
-    mask = np.asarray([0b11010110], dtype=np.byte)
+    mask = np.asarray([0b11010110]).astype(np.byte)

     sr = Series.from_masked_array(data=data, mask=mask, null_count=3)
     assert sr.has_nulls
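The mask constant 0b11010110 is 214, above the np.byte (int8) maximum of 127, so constructing the array at that dtype would trip the deprecation while the explicit cast wraps:

import numpy as np

print(0b11010110)                                # 214
print(np.asarray([0b11010110]).astype(np.byte))  # [-42] -- explicit cast wraps modulo 256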
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_unaops.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.

 import itertools
 import operator
@@ -77,9 +77,10 @@ def generate_valid_scalar_unaop_combos():
     return results


+@pytest.mark.filterwarnings("ignore:overflow encountered in scalar negative")
 @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos())
 def test_scalar_unary_operations(slr, dtype, op):
-    slr_host = cudf.dtype(dtype).type(slr)
+    slr_host = np.array([slr])[0].astype(cudf.dtype(dtype))
     slr_device = cudf.Scalar(slr, dtype=dtype)

     expect = op(slr_host)
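The new filter matches the warning NumPy raises when negating the most negative value of a signed type, which has no positive counterpart; a minimal reproduction, assuming 1.24 semantics:

import warnings

import numpy as np

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    -np.int8(-128)  # int8 has no +128

print(caught[0].message)  # overflow encountered in scalar negative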
3 changes: 2 additions & 1 deletion python/cudf/cudf/utils/queryutils.py
@@ -138,7 +138,8 @@ def query_compile(expr):
     key "args" is a sequence of name of the arguments.
     """

-    funcid = f"queryexpr_{np.uintp(hash(expr)):x}"
+    # hash returns in the semi-open interval [-2**63, 2**63)
+    funcid = f"queryexpr_{(hash(expr) + 2**63):x}"
     # Load cache
     compiled = _cache.get(funcid)
     # Cache not found
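Python's hash() is signed, so np.uintp(hash(expr)) was an out-of-bound conversion for negative hashes under NumPy >= 1.24; adding 2**63 maps [-2**63, 2**63) onto [0, 2**64), which formats cleanly as unsigned hex. A small sketch with a hypothetical hash value:

h = -42  # stand-in for a negative hash(expr)
print(f"queryexpr_{(h + 2**63):x}")  # queryexpr_7fffffffffffffd6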
4 changes: 2 additions & 2 deletions python/cudf/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cmake>=3.23.1,!=3.25.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "numpy>=1.21,<1.24",
+    "numpy>=1.21",
     "protoc-wheel",
     "pyarrow==11.0.0.*",
     "rmm==23.6.*",
@@ -32,7 +32,7 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "numba>=0.57",
-    "numpy>=1.21,<1.24",
+    "numpy>=1.21",
     "nvtx>=0.2.1",
     "packaging",
     "pandas>=1.3,<1.6.0dev0",
2 changes: 1 addition & 1 deletion python/cudf_kafka/pyproject.toml
@@ -4,7 +4,7 @@

 requires = [
     "cython>=0.29,<0.30",
-    "numpy>=1.21,<1.24",
+    "numpy>=1.21",
     "pyarrow==11.0.0.*",
     "setuptools",
     "wheel",
2 changes: 1 addition & 1 deletion python/dask_cudf/pyproject.toml
@@ -23,7 +23,7 @@ dependencies = [
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
     "fsspec>=0.6.0",
-    "numpy>=1.21,<1.24",
+    "numpy>=1.21",
     "pandas>=1.3,<1.6.0dev0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
