rapidsai · rapids-bot · May 25, 2023 · May 8, 2023 · May 9, 2023 · May 10, 2023
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
@@ -85,6 +85,7 @@
 
 try:
     from cubinlinker.patch import patch_numba_linker_if_needed
+    from ptxcompiler.patch import patch_numba_codegen_if_needed
 except ImportError:
     pass
 else:
@@ -96,6 +97,7 @@
 
     _setup_numba_linker(_PTX_FILE)
 
+    patch_numba_codegen_if_needed()
     del patch_numba_linker_if_needed
 
 cuda.set_memory_manager(RMMNumbaManager)

@@ -766,10 +766,12 @@ def _normalize_find_and_replace_input(
         if len(col_to_normalize) == 1:
             if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]):
                 return normalized_column.astype(input_column_dtype)
-            else:
-                col_to_normalize_casted = input_column_dtype.type(
-                    col_to_normalize[0]
-                )
+            if np.isinf(col_to_normalize[0]):
+                return normalized_column
+            col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
+                input_column_dtype
+            )
+
             if not np.isnan(col_to_normalize_casted) and (
                 col_to_normalize_casted != col_to_normalize[0]
             ):

@@ -398,8 +398,8 @@ def test_column_view_string_slice(slc):
             cudf.core.column.as_column([], dtype="uint8"),
         ),
         (
-            cp.array([453], dtype="uint8"),
-            cudf.core.column.as_column([453], dtype="uint8"),
+            cp.array([255], dtype="uint8"),
+            cudf.core.column.as_column([255], dtype="uint8"),
         ),
     ],
 )

@@ -150,8 +150,8 @@ def make_all_numeric_extremes_dataframe():
         np_type = pdf_dtypes[gdf_dtype]
         if np.issubdtype(np_type, np.integer):
             itype = np.iinfo(np_type)
-            extremes = [0, +1, -1, itype.min, itype.max]
-            df[gdf_dtype] = np.array(extremes * 4, dtype=np_type)[:20]
+            extremes = [itype.min, itype.max]
+            df[gdf_dtype] = np.array(extremes * 10, dtype=np_type)[:20]
         else:
             ftype = np.finfo(np_type)
             extremes = [
@@ -1433,7 +1433,7 @@ def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype):
 
     gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])
 
-    expected = np.array(values, dtype=np_dtype)
+    expected = np.array(values).astype(np_dtype)
     actual = gdf["hex_int"].to_numpy()
     np.testing.assert_array_equal(expected, actual)
 

@@ -15,23 +15,16 @@
 @pytest.fixture(params=[0, 1, 10, 100])
 def pdf(request):
     types = NUMERIC_TYPES + ["bool"]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = request.param
 
     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
+        {f"col_{typ}": np.random.randint(0, nrows, nrows) for typ in types}
     )
     # Delete the name of the column index, and rename the row index
     test_pdf.columns.name = None
     test_pdf.index.name = "index"
 
-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype("object").astype(typer)
-
     # Create non-numeric categorical data otherwise may get typecasted
     data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)]
     test_pdf["col_category"] = pd.Series(data, dtype="category")

@@ -32,13 +32,11 @@ def make_numeric_dataframe(nrows, dtype):
 def pdf(request):
     types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"]
     typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = request.param
 
     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
+        {f"col_{typ}": np.random.randint(0, nrows, nrows) for typ in types}
     )
     # Delete the name of the column index, and rename the row index
     test_pdf.columns.name = None

@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
@@ -194,6 +194,7 @@ def test_to_numeric_downcast_int(data, downcast):
     assert_eq(expected, got)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [
@@ -223,6 +224,7 @@ def test_to_numeric_downcast_float(data, downcast):
     assert_eq(expected, got)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [
@@ -245,6 +247,7 @@ def test_to_numeric_downcast_large_float(data, downcast):
     assert_eq(expected, got)
 
 
+@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [
@@ -325,6 +328,7 @@ def test_to_numeric_downcast_string_float(data, downcast):
     assert_eq(expected, got)
 
 
+@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
 @pytest.mark.parametrize(
     "data",
     [

@@ -69,14 +69,11 @@ def simple_pdf(request):
         "float32",
         "float64",
     ]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = request.param
 
     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
+        {f"col_{typ}": np.random.randint(0, nrows, nrows) for typ in types},
         # Need to ensure that this index is not a RangeIndex to get the
         # expected round-tripping behavior from Parquet reader/writer.
         index=pd.Index(list(range(nrows))),
@@ -85,10 +82,6 @@ def simple_pdf(request):
     test_pdf.columns.name = None
     test_pdf.index.name = "test_index"
 
-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype("object").astype(typer)
-
     return test_pdf
 
 
@@ -115,13 +108,11 @@ def build_pdf(num_columns, day_resolution_timestamps):
         "datetime64[us]",
         "str",
     ]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
     nrows = num_columns.param
 
     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
+        {f"col_{typ}": np.random.randint(0, nrows, nrows) for typ in types},
         columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
         # Need to ensure that this index is not a RangeIndex to get the
         # expected round-tripping behavior from Parquet reader/writer.
@@ -131,10 +122,6 @@ def build_pdf(num_columns, day_resolution_timestamps):
     test_pdf.columns.name = None
     test_pdf.index.name = "test_index"
 
-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype(typer)
-
     # make datetime64's a little more interesting by increasing the range of
     # dates note that pandas will convert these to ns timestamps, so care is
     # taken to avoid overflowing a ns timestamp. There is also the ability to

@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from itertools import chain, combinations_with_replacement, product
 
@@ -125,7 +125,7 @@ def test_rank_error_arguments(pdf):
     np.full((3,), np.inf),
     np.full((3,), -np.inf),
 ]
-sort_dtype_args = [np.int32, np.int64, np.float32, np.float64]
+sort_dtype_args = [np.float32, np.float64]
 
 
 @pytest.mark.parametrize(
@@ -139,13 +139,12 @@ def test_rank_error_arguments(pdf):
 )
 def test_series_rank_combinations(elem, dtype):
     np.random.seed(0)
+    aa = np.fromiter(chain.from_iterable(elem), dtype=dtype)
     gdf = DataFrame()
-    gdf["a"] = aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(
-        dtype
-    )
-    ranked_gs = gdf["a"].rank(method="first")
     df = pd.DataFrame()
+    gdf["a"] = aa
     df["a"] = aa
+    ranked_gs = gdf["a"].rank(method="first")
     ranked_ps = df["a"].rank(method="first")
     # Check
-    assert_eq(ranked_ps, ranked_gs.to_pandas())
+    assert_eq(ranked_ps, ranked_gs)
@@ -944,8 +944,15 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
     psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
     sr = cudf.from_pandas(psr)
 
+    if sr.dtype.kind in "ui":
+        can_replace = np.array([replacement])[0].is_integer() and np.can_cast(
+            int(replacement), sr.dtype
+        )
+    else:
+        can_replace = np.can_cast(replacement, sr.dtype)
+
     # Both Scalar
-    if sr.dtype.type(replacement) != replacement:
+    if not can_replace:
         with pytest.raises(TypeError):
             sr.replace(1, replacement)
     else:
@@ -954,7 +961,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
         assert_eq(expect, got)
 
     # to_replace is a list, replacement is a scalar
-    if sr.dtype.type(replacement) != replacement:
+    if not can_replace:
         with pytest.raises(TypeError):
 
             sr.replace([2, 3], replacement)
@@ -974,7 +981,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
     # Both lists of equal length
     if (
         np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"}
-    ) or (sr.dtype.type(replacement) != replacement):
+    ) or (not can_replace):
         with pytest.raises(TypeError):
             sr.replace([2, 3], [replacement, replacement])
     else:

@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
 import numpy as np
 
@@ -7,7 +7,7 @@
 
 def test_to_dense_array():
     data = np.random.random(8)
-    mask = np.asarray([0b11010110], dtype=np.byte)
+    mask = np.asarray([0b11010110]).astype(np.byte)
 
     sr = Series.from_masked_array(data=data, mask=mask, null_count=3)
     assert sr.has_nulls

@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
 import itertools
 import operator
@@ -79,9 +79,13 @@ def generate_valid_scalar_unaop_combos():
 
 @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos())
 def test_scalar_unary_operations(slr, dtype, op):
-    slr_host = cudf.dtype(dtype).type(slr)
+    slr_host = np.array([slr])[0].astype(cudf.dtype(dtype))
     slr_device = cudf.Scalar(slr, dtype=dtype)
 
+    if op.__name__ == "neg" and np.dtype(dtype).kind == "u":
+        # TODO: define expected result of negating an unsigned scalar
+        return
+
     expect = op(slr_host)
     got = op(slr_device)
 

@@ -137,7 +137,7 @@ def query_compile(expr):
         key "args" is a sequence of name of the arguments.
     """
 
-    funcid = f"queryexpr_{np.uintp(hash(expr)):x}"
+    funcid = f"queryexpr_{hash(expr) + 2**63:x}"
     # Load cache
     compiled = _cache.get(funcid)
     # Cache not found