diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
index e081119ff89..c98b92f7083 100644
--- a/python/cudf/cudf/tests/test_hash_vocab.py
+++ b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import filecmp
 import os
 import warnings
@@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
 
-    with warnings.catch_warnings():
-        # See https://github.com/rapidsai/cudf/issues/12403
-        warnings.simplefilter(action="ignore", category=RuntimeWarning)
-        hash_vocab(vocab_path, output_path)
+    warnings.simplefilter(action="ignore", category=RuntimeWarning)
+    hash_vocab(vocab_path, output_path)
 
     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 618c4f30bd9..449f21721f4 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc):
     if asc:
         expected = np.argsort(sr.to_numpy(), kind="mergesort")
     else:
-        expected = np.argsort(sr.to_numpy() * -1, kind="mergesort")
+        # -1 multiply works around missing desc sort (may promote to float64)
+        expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort")
 
     np.testing.assert_array_equal(expected, res.to_numpy())
diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py
index ef078ed8c5d..babe4be2715 100644
--- a/python/cudf/cudf/utils/hash_vocab_utils.py
+++ b/python/cudf/cudf/utils/hash_vocab_utils.py
@@ -7,8 +7,8 @@
 # Coefficients ranges for inner hash - This are important to set to be
 # large so that we have randomness in the bottom bits when modding
-A_SECOND_LEVEL_POW = np.uint8(48)
-B_SECOND_LEVEL_POW = np.uint8(7)
+A_SECOND_LEVEL_POW = np.uint64(48)
+B_SECOND_LEVEL_POW = np.uint64(7)
 
 A_LBOUND_SECOND_LEVEL_HASH = 2**16
 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
@@ -23,11 +23,11 @@
 
 # Shifts for bit packing
-A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW)
-B_SECOND_LEVEL_SHIFT_AMT = np.uint8(
+A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
+B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
     64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
 )
-BITS_FOR_INNER_TABLE_SIZE = np.uint8(8)
+BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)
 
 NOT_FOUND = -1
 
@@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin):
     while True:
         a = np.random.randint(
-            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
+            A_LBOUND_SECOND_LEVEL_HASH,
+            A_HBOUND_SECOND_LEVEL_HASH,
         )
         b = np.random.randint(
             B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
         )
@@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant):
         bin_length = len(internal_table)
         max_bin_length = max(bin_length, max_bin_length)
         internal_table_coeffs[i] = (
-            coeff_a << A_SECOND_LEVEL_SHIFT_AMT
-            | coeff_b << B_SECOND_LEVEL_SHIFT_AMT
-            | bin_length
-        )
-        offset_into_flattened_table[i + 1] = (
-            offset_into_flattened_table[i] + bin_length
+            np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(bin_length)
         )
+        offset_into_flattened_table[i + 1] = offset_into_flattened_table[
+            i
+        ] + np.uint64(bin_length)
         flattened_bins.extend(internal_table)
 
    print(