From 2b1ca4839153fa692bcdc647a49c6cd3609f58d1 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 29 May 2024 12:06:19 +0200
Subject: [PATCH 1/5] MAINT: Avoid `numpy.array` with `copy=False` (meaning changed)

---
 python/cudf/cudf/core/buffer/buffer.py           | 4 ++--
 python/cudf/cudf/core/buffer/spillable_buffer.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 5c2d77033b8..bf6f9f1a3c1 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -191,7 +191,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create an owner from a buffer or array like object

         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`

         The host memory is copied to a new device allocation.
@@ -209,7 +209,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """
         # Convert to numpy array, this will not copy data in most cases.
-        ary = numpy.array(data, copy=False, subok=True)
+        ary = numpy.asanyarray(data)
         # Extract pointer and size
         ptr, size = get_ptr_and_size(ary.__array_interface__)
         # Copy to device memory
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index a1af3ba8c9d..49258fea9ab 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -146,7 +146,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create a spillabe buffer from host memory.

         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`

         The new buffer is marked as spilled to host memory already.
@@ -165,7 +165,7 @@ def from_host_memory(cls, data: Any) -> Self:

         # Convert to a memoryview using numpy array, this will not copy data
         # in most cases.
-        data = memoryview(numpy.array(data, copy=False, subok=True))
+        data = memoryview(numpy.asanyarray(data))
         if not data.c_contiguous:
             raise ValueError("Buffer data must be C-contiguous")
         data = data.cast("B")  # Make sure itemsize==1

From 956d99077fe6964584ef853e7d2ff70478900e0c Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 29 May 2024 12:07:29 +0200
Subject: [PATCH 2/5] TST: Use `float()` to avoid NumPy 2 overflowing the float32

---
 python/cudf/cudf/tests/test_numerical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 2e3be92dbeb..03081208739 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -44,7 +44,7 @@ def test_can_cast_safely_same_kind():
     assert data.can_cast_safely(to_dtype)

     data = cudf.Series(
-        [np.finfo("float32").max * 2, 1.0], dtype="float64"
+        [float(np.finfo("float32").max) * 2, 1.0], dtype="float64"
     )._column
     to_dtype = np.dtype("float32")
     assert not data.can_cast_safely(to_dtype)

From eddc42fd959455885b9ed5d1746eeb6f6b13ab0f Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 29 May 2024 12:10:27 +0200
Subject: [PATCH 3/5] TST: Parametrize and fortify against NumPy 2 raising for
 e.g. `uint8(-1)`

---
 python/cudf/cudf/tests/test_replace.py | 43 +++++++-------------------
 1 file changed, 12 insertions(+), 31 deletions(-)

diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 8992c4d617b..88102b8aaaf 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

+import operator
 import re
 from decimal import Decimal

@@ -825,43 +826,23 @@ def test_series_fillna_invalid_dtype(data_dtype):

 @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES)
 @pytest.mark.parametrize("fill_value", [100, 100.0, 128.5])
-def test_series_where(data_dtype, fill_value):
+@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt])
+def test_series_where(data_dtype, fill_value, op):
     psr = pd.Series(list(range(10)), dtype=data_dtype)
     sr = cudf.from_pandas(psr)

-    if sr.dtype.type(fill_value) != fill_value:
-        with pytest.raises(TypeError):
-            sr.where(sr > 0, fill_value)
-    else:
-        # Cast back to original dtype as pandas automatically upcasts
-        expect = psr.where(psr > 0, fill_value)
-        got = sr.where(sr > 0, fill_value)
-        # pandas returns 'float16' dtype, which is not supported in cudf
-        assert_eq(
-            expect,
-            got,
-            check_dtype=expect.dtype.kind not in ("f"),
-        )
+    try:
+        scalar_fits = sr.dtype.type(fill_value) == fill_value
+    except OverflowError:
+        scalar_fits = False

-    if sr.dtype.type(fill_value) != fill_value:
+    if not scalar_fits:
         with pytest.raises(TypeError):
-            sr.where(sr < 0, fill_value)
+            sr.where(op(sr, 0), fill_value)
     else:
-        expect = psr.where(psr < 0, fill_value)
-        got = sr.where(sr < 0, fill_value)
-        # pandas returns 'float16' dtype, which is not supported in cudf
-        assert_eq(
-            expect,
-            got,
-            check_dtype=expect.dtype.kind not in ("f"),
-        )
-
-    if sr.dtype.type(fill_value) != fill_value:
-        with pytest.raises(TypeError):
-            sr.where(sr == 0, fill_value)
-    else:
-        expect = psr.where(psr == 0, fill_value)
-        got = sr.where(sr == 0, fill_value)
+        # Cast back to original dtype as pandas automatically upcasts
+        expect = psr.where(op(psr, 0), fill_value)
+        got = sr.where(op(sr, 0), fill_value)
         # pandas returns 'float16' dtype, which is not supported in cudf
         assert_eq(
             expect,
             got,

From 1431cdbd91d2e759f9c701d2b3521258d6f426bb Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 29 May 2024 12:11:11 +0200
Subject: [PATCH 4/5] TST: Fortify against NumPy 2 can_cast not being valid for Python ints

---
 python/cudf/cudf/tests/test_replace.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 88102b8aaaf..d77ec596271 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -966,12 +966,8 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
     psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
     sr = cudf.from_pandas(psr)

-    if sr.dtype.kind in "ui":
-        can_replace = np.array([replacement])[0].is_integer() and np.can_cast(
-            int(replacement), sr.dtype
-        )
-    else:
-        can_replace = np.can_cast(replacement, sr.dtype)
+    numpy_replacement = np.array(replacement).astype(sr.dtype)[()]
+    can_replace = numpy_replacement == replacement

     # Both Scalar
     if not can_replace:

From ce37898db1450b333b1e0cc75fb7e9a912afe641 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 29 May 2024 12:14:15 +0200
Subject: [PATCH 5/5] Use uint64 in hash_vocab and clean up warnings

Using uint8 fails because mixing it with Python integers tries to stick
with uint8 on NumPy 2. Using Python integers fails some NumPy 1.x paths,
because mixing them with uint64 scalars fails on NumPy 1. uint64 should
work on both versions.

Also moved a warnings ignore from the tests to the function.
---
 python/cudf/cudf/tests/test_hash_vocab.py  |  8 +++----
 python/cudf/cudf/tests/test_sorting.py     |  3 ++-
 python/cudf/cudf/utils/hash_vocab_utils.py | 25 +++++++++++-----------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
index e081119ff89..c98b92f7083 100644
--- a/python/cudf/cudf/tests/test_hash_vocab.py
+++ b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import filecmp
 import os
 import warnings
@@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
-    with warnings.catch_warnings():
-        # See https://github.com/rapidsai/cudf/issues/12403
-        warnings.simplefilter(action="ignore", category=RuntimeWarning)
-        hash_vocab(vocab_path, output_path)
+    warnings.simplefilter(action="ignore", category=RuntimeWarning)
+    hash_vocab(vocab_path, output_path)

     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 618c4f30bd9..449f21721f4 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc):
     if asc:
         expected = np.argsort(sr.to_numpy(), kind="mergesort")
     else:
-        expected = np.argsort(sr.to_numpy() * -1, kind="mergesort")
+        # -1 multiply works around missing desc sort (may promote to float64)
+        expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort")
     np.testing.assert_array_equal(expected, res.to_numpy())
diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py
index ef078ed8c5d..babe4be2715 100644
--- a/python/cudf/cudf/utils/hash_vocab_utils.py
+++ b/python/cudf/cudf/utils/hash_vocab_utils.py
@@ -7,8 +7,8 @@

 # Coefficients ranges for inner hash - This are important to set to be
 # large so that we have randomness in the bottom bits when modding
-A_SECOND_LEVEL_POW = np.uint8(48)
-B_SECOND_LEVEL_POW = np.uint8(7)
+A_SECOND_LEVEL_POW = np.uint64(48)
+B_SECOND_LEVEL_POW = np.uint64(7)

 A_LBOUND_SECOND_LEVEL_HASH = 2**16
 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
@@ -23,11 +23,11 @@

 # Shifts for bit packing
-A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW)
-B_SECOND_LEVEL_SHIFT_AMT = np.uint8(
+A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
+B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
     64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
 )
-BITS_FOR_INNER_TABLE_SIZE = np.uint8(8)
+BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)

 NOT_FOUND = -1
@@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin):

     while True:
         a = np.random.randint(
-            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
+            A_LBOUND_SECOND_LEVEL_HASH,
+            A_HBOUND_SECOND_LEVEL_HASH,
         )
         b = np.random.randint(
             B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
         )
@@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant):
         bin_length = len(internal_table)
         max_bin_length = max(bin_length, max_bin_length)
         internal_table_coeffs[i] = (
-            coeff_a << A_SECOND_LEVEL_SHIFT_AMT
-            | coeff_b << B_SECOND_LEVEL_SHIFT_AMT
-            | bin_length
-        )
-        offset_into_flattened_table[i + 1] = (
-            offset_into_flattened_table[i] + bin_length
+            np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(bin_length)
         )
+        offset_into_flattened_table[i + 1] = offset_into_flattened_table[
+            i
+        ] + np.uint64(bin_length)
         flattened_bins.extend(internal_table)

     print(