Skip to content

Commit

Permalink
Remove deprecated method Series.hash_encode. (#9942)
Browse files Browse the repository at this point in the history
This PR removes the deprecated method `Series.hash_encode`. Resolves #9475. Follows up on #9457, #9381.

This PR also removes libcudf code paths used solely for this Python method.

Users may replace code like `series.hash_encode(stop, use_name=False)` with `series.hash_values(method="murmur3") % stop`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Conor Hoekstra (https://github.com/codereport)

URL: #9942
  • Loading branch information
bdice authored Dec 23, 2021
1 parent 04f4219 commit c99a37f
Show file tree
Hide file tree
Showing 12 changed files with 46 additions and 234 deletions.
14 changes: 6 additions & 8 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,15 @@ namespace detail {
*/
std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
cudf::host_span<uint32_t const> initial_hash = {},
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
hash_id hash_function = hash_id::HASH_MURMUR3,
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
cudf::host_span<uint32_t const> initial_hash = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> md5_hash(
table_view const& input,
Expand Down
9 changes: 3 additions & 6 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,16 @@ namespace cudf {
*
* @param input The table of columns to hash.
* @param hash_function The hash function enum to use.
* @param initial_hash Optional host_span of initial hash values for each column.
* If this span is empty then each element will be hashed as-is.
* @param seed Optional seed value to use for the hash function.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns A column where each row is the hash of the corresponding row of the input table.
*/
std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
cudf::host_span<uint32_t const> initial_hash = {},
uint32_t seed = DEFAULT_HASH_SEED,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
hash_id hash_function = hash_id::HASH_MURMUR3,
uint32_t seed = DEFAULT_HASH_SEED,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
48 changes: 0 additions & 48 deletions cpp/include/cudf/table/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -539,52 +539,4 @@ class row_hasher {
uint32_t _seed{DEFAULT_HASH_SEED};
};

/**
* @brief Computes the hash value of a row in the given table, combined with an
* initial hash value for each column.
*
* For a given row, each column's element hash is first combined with that
* column's entry in the initial hash array, and the per-column results are then
* folded together with the hash function's `hash_combine`.
*
* @tparam hash_function Hash functor to use for hashing elements.
* @tparam Nullate A cudf::nullate type describing how to check for nulls.
*/
template <template <typename> class hash_function, typename Nullate>
class row_hasher_initial_values {
public:
// A hasher without a table and initial values is meaningless.
row_hasher_initial_values() = delete;

/**
* @brief Constructs a row hasher over a table with per-column initial hashes.
*
* @param has_nulls Nullate value forwarded to the element hasher for null checks.
* @param t Device view of the table whose rows will be hashed.
* @param initial_hash Pointer to initial hash values, indexed by column index.
* It is dereferenced inside the `__device__` call operator, so it presumably
* must be device-accessible and hold one value per column of `t` -- TODO confirm
* against callers.
*/
row_hasher_initial_values(Nullate has_nulls, table_device_view t, hash_value_type* initial_hash)
: _table{t}, _initial_hash(initial_hash), _has_nulls{has_nulls}
{
}

/**
* @brief Computes the combined hash of all column elements in one row.
*
* @param row_index Index of the row to hash.
* @return The result of reducing the per-column hashes with `hash_combine`,
* starting from an initial reduction value of 0.
*/
__device__ auto operator()(size_type row_index) const
{
// Binary combiner that delegates to the hash functor's hash_combine.
auto hash_combiner = [](hash_value_type lhs, hash_value_type rhs) {
return hash_function<hash_value_type>{}.hash_combine(lhs, rhs);
};

// Hashes an element in a column and combines with an initial value
auto hasher = [=](size_type column_index) {
// Dispatch on the column's type so the element at row_index is hashed
// by element_hasher instantiated for that type.
auto hash_value = cudf::type_dispatcher<dispatch_storage_type>(
_table.column(column_index).type(),
element_hasher<hash_function, Nullate>{_has_nulls},
_table.column(column_index),
row_index);

// The column's initial hash is the left operand, the element hash the right.
return hash_combiner(_initial_hash[column_index], hash_value);
};

// Hash each element and combine all the hash values together
// The reduction runs with the sequential policy (thrust::seq) over column
// indices [0, num_columns).
return thrust::transform_reduce(thrust::seq,
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(_table.num_columns()),
hasher,
hash_value_type{0},
hash_combiner);
}

private:
table_device_view _table;        // Table whose rows are hashed
hash_value_type* _initial_hash;  // Initial hash values, indexed by column index
Nullate _has_nulls;              // Passed to element_hasher for null handling
};

} // namespace cudf
6 changes: 2 additions & 4 deletions cpp/src/hash/hashing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,12 @@ std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
cudf::host_span<uint32_t const> initial_hash,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
switch (hash_function) {
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, stream, mr);
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, stream, mr);
case (hash_id::HASH_MD5): return md5_hash(input, stream, mr);
case (hash_id::HASH_SERIAL_MURMUR3):
return serial_murmur_hash3_32<MurmurHash3_32>(input, seed, stream, mr);
Expand All @@ -108,12 +107,11 @@ std::unique_ptr<column> hash(table_view const& input,

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
cudf::host_span<uint32_t const> initial_hash,
uint32_t seed,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::hash(input, hash_function, initial_hash, seed, rmm::cuda_stream_default, mr);
return detail::hash(input, hash_function, seed, rmm::cuda_stream_default, mr);
}

} // namespace cudf
25 changes: 6 additions & 19 deletions cpp/src/hash/murmur_hash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ namespace cudf {
namespace detail {

std::unique_ptr<column> murmur_hash3_32(table_view const& input,
cudf::host_span<uint32_t const> initial_hash,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand All @@ -44,24 +43,12 @@ std::unique_ptr<column> murmur_hash3_32(table_view const& input,
auto const device_input = table_device_view::create(input, stream);
auto output_view = output->mutable_view();

// Compute the hash value for each row depending on the specified hash function
if (!initial_hash.empty()) {
CUDF_EXPECTS(initial_hash.size() == size_t(input.num_columns()),
"Expected same size of initial hash values as number of columns");
auto device_initial_hash = make_device_uvector_async(initial_hash, stream);

thrust::tabulate(rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
row_hasher_initial_values<MurmurHash3_32, nullate::DYNAMIC>(
nullate::DYNAMIC{nullable}, *device_input, device_initial_hash.data()));
} else {
thrust::tabulate(
rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
row_hasher<MurmurHash3_32, nullate::DYNAMIC>(nullate::DYNAMIC{nullable}, *device_input));
}
// Compute the hash value for each row
thrust::tabulate(
rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
row_hasher<MurmurHash3_32, nullate::DYNAMIC>(nullate::DYNAMIC{nullable}, *device_input));

return output;
}
Expand Down
50 changes: 25 additions & 25 deletions cpp/tests/hashing/hash_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,13 @@ TEST_F(HashTest, MultiValueNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0);
auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0);
auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

EXPECT_EQ(input1.num_rows(), spark_output1->size());
Expand All @@ -147,13 +147,13 @@ TYPED_TEST(HashTestTyped, Equality)
EXPECT_EQ(input.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0);
auto const serial_output1 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0);
auto const spark_output1 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3);

EXPECT_EQ(input.num_rows(), spark_output1->size());
Expand All @@ -177,13 +177,13 @@ TYPED_TEST(HashTestTyped, EqualityNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0);
auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0);
auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

EXPECT_EQ(input1.num_rows(), spark_output1->size());
Expand Down Expand Up @@ -222,7 +222,7 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, verbosity);

constexpr auto serial_hasher = cudf::hash_id::HASH_SERIAL_MURMUR3;
auto const serial_col = cudf::hash(table_col, serial_hasher, {}, 0);
auto const serial_col = cudf::hash(table_col, serial_hasher, 0);
auto const serial_col_neg_zero = cudf::hash(table_col_neg_zero, serial_hasher);
auto const serial_col_neg_nan = cudf::hash(table_col_neg_nan, serial_hasher);

Expand All @@ -231,7 +231,7 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)

// Spark hash is sensitive to 0 and -0
constexpr auto spark_hasher = cudf::hash_id::HASH_SPARK_MURMUR3;
auto const spark_col = cudf::hash(table_col, spark_hasher, {}, 0);
auto const spark_col = cudf::hash(table_col, spark_hasher, 0);
auto const spark_col_neg_nan = cudf::hash(table_col_neg_nan, spark_hasher);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan);
Expand Down Expand Up @@ -269,8 +269,8 @@ TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds)
auto const combo2 = cudf::table_view({strings_col, ints_col, bools_col2});

constexpr auto hasher = cudf::hash_id::HASH_SERIAL_MURMUR3;
auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314);
auto const ints_hash = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42);
auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, 314);
auto const ints_hash = cudf::hash(cudf::table_view({ints_col}), hasher, 42);
auto const combo1_hash = cudf::hash(combo1, hasher, {});
auto const combo2_hash = cudf::hash(combo2, hasher, {});
auto const structs_hash = cudf::hash(cudf::table_view({structs_col}), hasher, {});
Expand Down Expand Up @@ -396,20 +396,20 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds)
fixed_width_column_wrapper<bool> const bools_col2({0, 1, 2, 255, 0});

constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3;
auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, {}, 42);
auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314);
auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42);
auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42);
auto const hash_decimal64 = cudf::hash(cudf::table_view({decimal64_col}), hasher, {}, 42);
auto const hash_longs = cudf::hash(cudf::table_view({longs_col}), hasher, {}, 42);
auto const hash_floats = cudf::hash(cudf::table_view({floats_col}), hasher, {}, 42);
auto const hash_dates = cudf::hash(cudf::table_view({dates_col}), hasher, {}, 42);
auto const hash_decimal32 = cudf::hash(cudf::table_view({decimal32_col}), hasher, {}, 42);
auto const hash_ints = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42);
auto const hash_shorts = cudf::hash(cudf::table_view({shorts_col}), hasher, {}, 42);
auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, {}, 42);
auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42);
auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42);
auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, 42);
auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, 314);
auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, 42);
auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, 42);
auto const hash_decimal64 = cudf::hash(cudf::table_view({decimal64_col}), hasher, 42);
auto const hash_longs = cudf::hash(cudf::table_view({longs_col}), hasher, 42);
auto const hash_floats = cudf::hash(cudf::table_view({floats_col}), hasher, 42);
auto const hash_dates = cudf::hash(cudf::table_view({dates_col}), hasher, 42);
auto const hash_decimal32 = cudf::hash(cudf::table_view({decimal32_col}), hasher, 42);
auto const hash_ints = cudf::hash(cudf::table_view({ints_col}), hasher, 42);
auto const hash_shorts = cudf::hash(cudf::table_view({shorts_col}), hasher, 42);
auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, 42);
auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, 42);
auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, 42);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, verbosity);
Expand Down Expand Up @@ -439,7 +439,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds)
shorts_col,
bytes_col,
bools_col2});
auto const hash_combined = cudf::hash(combined_table, hasher, {}, 42);
auto const hash_combined = cudf::hash(combined_table, hasher, 42);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, verbosity);
}

Expand Down
2 changes: 0 additions & 2 deletions docs/cudf/source/api_docs/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,4 @@ Serialization / IO / conversion
Series.from_categorical
Series.from_masked_array
Series.from_pandas
Series.hash_encode
Series.hash_values

3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/cpp/hash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ from cudf._lib.cpp.table.table_view cimport table_view
cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
cdef unique_ptr[column] hash "cudf::hash" (
const table_view& input,
const libcudf_types.hash_id& hash_function,
const vector[uint32_t]& initial_hash,
const libcudf_types.hash_id hash_function,
const uint32_t seed
) except +
4 changes: 1 addition & 3 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ def hash_partition(source_table, object columns_to_hash,
)


def hash(source_table, str method, object initial_hash=None, int seed=0):
cdef vector[uint32_t] c_initial_hash = initial_hash or []
def hash(source_table, str method, int seed=0):
cdef table_view c_source_view = table_view_from_table(
source_table, ignore_index=True)
cdef unique_ptr[column] c_result
Expand All @@ -71,7 +70,6 @@ def hash(source_table, str method, object initial_hash=None, int seed=0):
cpp_hash(
c_source_view,
c_hash_function,
c_initial_hash,
seed
)
)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,8 +580,8 @@ def _gather(
result._copy_type_metadata(self)
return result

def _hash(self, method, initial_hash=None):
return libcudf.hash.hash(self, method, initial_hash)
def _hash(self, method):
return libcudf.hash.hash(self, method)

def _hash_partition(
self, columns_to_hash, num_partitions, keep_index=True
Expand Down
Loading

0 comments on commit c99a37f

Please sign in to comment.