Use uint64 in hash_voca and clean up warnings

Using uint8 fails because mixing it with Python integers try to stick with uint8 on NumPy 2. Using Python integers fails some NumPy 1.x paths, because mixing them with uint64 scalars fails on NumPy 1. uint64 should work on both versions. Also moved a warnings ignore from the tests to the function
rapidsai · May 29, 2024 · ce37898 · ce37898
1 parent 1431cdb
commit ce37898
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 18 deletions.
diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import filecmp
 import os
 import warnings
@@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
 
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
-    with warnings.catch_warnings():
-        # See https://github.com/rapidsai/cudf/issues/12403
-        warnings.simplefilter(action="ignore", category=RuntimeWarning)
-        hash_vocab(vocab_path, output_path)
+    warnings.simplefilter(action="ignore", category=RuntimeWarning)
+    hash_vocab(vocab_path, output_path)
 
     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
@@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc):
     if asc:
         expected = np.argsort(sr.to_numpy(), kind="mergesort")
     else:
-        expected = np.argsort(sr.to_numpy() * -1, kind="mergesort")
+        # -1 multiply works around missing desc sort (may promote to float64)
+        expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort")
     np.testing.assert_array_equal(expected, res.to_numpy())
 
 

diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py
@@ -7,8 +7,8 @@
 
 # Coefficients ranges for inner hash - This are important to set to be
 # large so that we have randomness in the bottom bits when modding
-A_SECOND_LEVEL_POW = np.uint8(48)
-B_SECOND_LEVEL_POW = np.uint8(7)
+A_SECOND_LEVEL_POW = np.uint64(48)
+B_SECOND_LEVEL_POW = np.uint64(7)
 
 A_LBOUND_SECOND_LEVEL_HASH = 2**16
 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
@@ -23,11 +23,11 @@
 
 
 # Shifts for bit packing
-A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW)
-B_SECOND_LEVEL_SHIFT_AMT = np.uint8(
+A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
+B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
     64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
 )
-BITS_FOR_INNER_TABLE_SIZE = np.uint8(8)
+BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)
 
 NOT_FOUND = -1
 
@@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin):
 
     while True:
         a = np.random.randint(
-            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
+            A_LBOUND_SECOND_LEVEL_HASH,
+            A_HBOUND_SECOND_LEVEL_HASH,
         )
         b = np.random.randint(
             B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
@@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant):
         bin_length = len(internal_table)
         max_bin_length = max(bin_length, max_bin_length)
         internal_table_coeffs[i] = (
-            coeff_a << A_SECOND_LEVEL_SHIFT_AMT
-            | coeff_b << B_SECOND_LEVEL_SHIFT_AMT
-            | bin_length
-        )
-        offset_into_flattened_table[i + 1] = (
-            offset_into_flattened_table[i] + bin_length
+            np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(bin_length)
         )
+        offset_into_flattened_table[i + 1] = offset_into_flattened_table[
+            i
+        ] + np.uint64(bin_length)
         flattened_bins.extend(internal_table)
 
     print(