Skip to content

Commit

Permalink
Use uint64 in hash_vocab and clean up warnings
Browse files Browse the repository at this point in the history
Using uint8 fails because, on NumPy 2, mixing it with Python integers
tries to keep the result as uint8.
Using Python integers fails on some NumPy 1.x paths, because mixing
them with uint64 scalars does not work on NumPy 1.

uint64 should work on both versions.

Also moved a warnings ignore from the tests to the function
  • Loading branch information
seberg committed May 29, 2024
1 parent 1431cdb commit ce37898
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 18 deletions.
8 changes: 3 additions & 5 deletions python/cudf/cudf/tests/test_hash_vocab.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import filecmp
import os
import warnings
Expand All @@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):

groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
output_path = tmpdir.join("cudf-vocab-hash.txt")
with warnings.catch_warnings():
# See https://github.com/rapidsai/cudf/issues/12403
warnings.simplefilter(action="ignore", category=RuntimeWarning)
hash_vocab(vocab_path, output_path)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
hash_vocab(vocab_path, output_path)

assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
3 changes: 2 additions & 1 deletion python/cudf/cudf/tests/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc):
if asc:
expected = np.argsort(sr.to_numpy(), kind="mergesort")
else:
expected = np.argsort(sr.to_numpy() * -1, kind="mergesort")
# -1 multiply works around missing desc sort (may promote to float64)
expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort")
np.testing.assert_array_equal(expected, res.to_numpy())


Expand Down
25 changes: 13 additions & 12 deletions python/cudf/cudf/utils/hash_vocab_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

# Coefficient ranges for the inner hash - these are important to set to be
# large so that we have randomness in the bottom bits when modding
A_SECOND_LEVEL_POW = np.uint8(48)
B_SECOND_LEVEL_POW = np.uint8(7)
A_SECOND_LEVEL_POW = np.uint64(48)
B_SECOND_LEVEL_POW = np.uint64(7)

A_LBOUND_SECOND_LEVEL_HASH = 2**16
A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
Expand All @@ -23,11 +23,11 @@


# Shifts for bit packing
A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW)
B_SECOND_LEVEL_SHIFT_AMT = np.uint8(
A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
)
BITS_FOR_INNER_TABLE_SIZE = np.uint8(8)
BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)

NOT_FOUND = -1

Expand Down Expand Up @@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin):

while True:
a = np.random.randint(
A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
A_LBOUND_SECOND_LEVEL_HASH,
A_HBOUND_SECOND_LEVEL_HASH,
)
b = np.random.randint(
B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
Expand Down Expand Up @@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant):
bin_length = len(internal_table)
max_bin_length = max(bin_length, max_bin_length)
internal_table_coeffs[i] = (
coeff_a << A_SECOND_LEVEL_SHIFT_AMT
| coeff_b << B_SECOND_LEVEL_SHIFT_AMT
| bin_length
)
offset_into_flattened_table[i + 1] = (
offset_into_flattened_table[i] + bin_length
np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
| np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
| np.uint64(bin_length)
)
offset_into_flattened_table[i + 1] = offset_into_flattened_table[
i
] + np.uint64(bin_length)
flattened_bins.extend(internal_table)

print(
Expand Down

0 comments on commit ce37898

Please sign in to comment.