Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove deprecated method Series.hash_encode. #9942

Merged
merged 4 commits into from
Dec 23, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,15 @@ namespace detail {
*/
std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
cudf::host_span<uint32_t const> initial_hash = {},
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
hash_id hash_function = hash_id::HASH_MURMUR3,
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> murmur_hash3_32(
table_view const& input,
cudf::host_span<uint32_t const> initial_hash = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> md5_hash(
table_view const& input,
Expand Down
9 changes: 3 additions & 6 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,16 @@ namespace cudf {
*
* @param input The table of columns to hash.
* @param hash_function The hash function enum to use.
* @param initial_hash Optional host_span of initial hash values for each column.
* If this span is empty then each element will be hashed as-is.
* @param seed Optional seed value to use for the hash function.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns A column where each row is the hash of the corresponding row of the input table.
*/
std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
cudf::host_span<uint32_t const> initial_hash = {},
uint32_t seed = DEFAULT_HASH_SEED,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
hash_id hash_function = hash_id::HASH_MURMUR3,
uint32_t seed = DEFAULT_HASH_SEED,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
48 changes: 0 additions & 48 deletions cpp/include/cudf/table/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -539,52 +539,4 @@ class row_hasher {
uint32_t _seed{DEFAULT_HASH_SEED};
};

/**
 * @brief Row-hashing functor that mixes a per-column initial hash value into each
 * element hash before folding all columns of a row into a single hash value.
 *
 * @tparam hash_function Hash functor to use for hashing elements.
 * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
template <template <typename> class hash_function, typename Nullate>
class row_hasher_initial_values {
 public:
  row_hasher_initial_values() = delete;

  /**
   * @brief Stores the table view, the initial hash values and the null-check policy.
   *
   * @param has_nulls Null-check policy handed to the element hasher.
   * @param t Device view of the table whose rows are hashed.
   * @param initial_hash Pointer to the initial hash values; it is indexed by column.
   */
  row_hasher_initial_values(Nullate has_nulls, table_device_view t, hash_value_type* initial_hash)
    : _table{t}, _initial_hash(initial_hash), _has_nulls{has_nulls}
  {
  }

  /**
   * @brief Computes the combined hash of the row at `row_index`.
   *
   * @param row_index Index of the row to hash.
   * @return The result of reducing every column's seeded hash with `hash_combine`,
   * starting from `hash_value_type{0}`.
   */
  __device__ auto operator()(size_type row_index) const
  {
    // Reduction step: delegate to the hash functor's own hash_combine
    auto combine = [](hash_value_type lhs, hash_value_type rhs) {
      return hash_function<hash_value_type>{}.hash_combine(lhs, rhs);
    };

    // Transform step: hash this row's element in the given column, then mix in
    // the initial hash value stored for that column
    auto seeded_column_hash = [=](size_type column_index) {
      auto const element_hash = cudf::type_dispatcher<dispatch_storage_type>(
        _table.column(column_index).type(),
        element_hasher<hash_function, Nullate>{_has_nulls},
        _table.column(column_index),
        row_index);

      return combine(_initial_hash[column_index], element_hash);
    };

    // Visit the column indices [0, num_columns) with thrust::seq and fold the
    // per-column results into one value
    auto const first_column = thrust::make_counting_iterator(0);
    auto const last_column  = thrust::make_counting_iterator(_table.num_columns());

    return thrust::transform_reduce(
      thrust::seq, first_column, last_column, seeded_column_hash, hash_value_type{0}, combine);
  }

 private:
  table_device_view _table;
  hash_value_type* _initial_hash;
  Nullate _has_nulls;
};

} // namespace cudf
6 changes: 2 additions & 4 deletions cpp/src/hash/hashing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,12 @@ std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
cudf::host_span<uint32_t const> initial_hash,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
switch (hash_function) {
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, stream, mr);
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, stream, mr);
case (hash_id::HASH_MD5): return md5_hash(input, stream, mr);
case (hash_id::HASH_SERIAL_MURMUR3):
return serial_murmur_hash3_32<MurmurHash3_32>(input, seed, stream, mr);
Expand All @@ -108,12 +107,11 @@ std::unique_ptr<column> hash(table_view const& input,

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
cudf::host_span<uint32_t const> initial_hash,
uint32_t seed,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::hash(input, hash_function, initial_hash, seed, rmm::cuda_stream_default, mr);
return detail::hash(input, hash_function, seed, rmm::cuda_stream_default, mr);
}

} // namespace cudf
25 changes: 6 additions & 19 deletions cpp/src/hash/murmur_hash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ namespace cudf {
namespace detail {

std::unique_ptr<column> murmur_hash3_32(table_view const& input,
cudf::host_span<uint32_t const> initial_hash,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand All @@ -44,24 +43,12 @@ std::unique_ptr<column> murmur_hash3_32(table_view const& input,
auto const device_input = table_device_view::create(input, stream);
auto output_view = output->mutable_view();

// Compute the hash value for each row depending on the specified hash function
if (!initial_hash.empty()) {
CUDF_EXPECTS(initial_hash.size() == size_t(input.num_columns()),
"Expected same size of initial hash values as number of columns");
auto device_initial_hash = make_device_uvector_async(initial_hash, stream);

thrust::tabulate(rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
row_hasher_initial_values<MurmurHash3_32, nullate::DYNAMIC>(
nullate::DYNAMIC{nullable}, *device_input, device_initial_hash.data()));
} else {
thrust::tabulate(
rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
row_hasher<MurmurHash3_32, nullate::DYNAMIC>(nullate::DYNAMIC{nullable}, *device_input));
}
// Compute the hash value for each row
thrust::tabulate(
rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
row_hasher<MurmurHash3_32, nullate::DYNAMIC>(nullate::DYNAMIC{nullable}, *device_input));

return output;
}
Expand Down
50 changes: 25 additions & 25 deletions cpp/tests/hashing/hash_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,13 @@ TEST_F(HashTest, MultiValueNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0);
auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0);
auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

EXPECT_EQ(input1.num_rows(), spark_output1->size());
Expand All @@ -147,13 +147,13 @@ TYPED_TEST(HashTestTyped, Equality)
EXPECT_EQ(input.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0);
auto const serial_output1 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0);
auto const spark_output1 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3);

EXPECT_EQ(input.num_rows(), spark_output1->size());
Expand All @@ -177,13 +177,13 @@ TYPED_TEST(HashTestTyped, EqualityNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0);
auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0);
auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

EXPECT_EQ(input1.num_rows(), spark_output1->size());
Expand Down Expand Up @@ -222,7 +222,7 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, verbosity);

constexpr auto serial_hasher = cudf::hash_id::HASH_SERIAL_MURMUR3;
auto const serial_col = cudf::hash(table_col, serial_hasher, {}, 0);
auto const serial_col = cudf::hash(table_col, serial_hasher, 0);
auto const serial_col_neg_zero = cudf::hash(table_col_neg_zero, serial_hasher);
auto const serial_col_neg_nan = cudf::hash(table_col_neg_nan, serial_hasher);

Expand All @@ -231,7 +231,7 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)

// Spark hash is sensitive to 0 and -0
constexpr auto spark_hasher = cudf::hash_id::HASH_SPARK_MURMUR3;
auto const spark_col = cudf::hash(table_col, spark_hasher, {}, 0);
auto const spark_col = cudf::hash(table_col, spark_hasher, 0);
auto const spark_col_neg_nan = cudf::hash(table_col_neg_nan, spark_hasher);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan);
Expand Down Expand Up @@ -269,8 +269,8 @@ TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds)
auto const combo2 = cudf::table_view({strings_col, ints_col, bools_col2});

constexpr auto hasher = cudf::hash_id::HASH_SERIAL_MURMUR3;
auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314);
auto const ints_hash = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42);
auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, 314);
auto const ints_hash = cudf::hash(cudf::table_view({ints_col}), hasher, 42);
auto const combo1_hash = cudf::hash(combo1, hasher, {});
auto const combo2_hash = cudf::hash(combo2, hasher, {});
auto const structs_hash = cudf::hash(cudf::table_view({structs_col}), hasher, {});
Expand Down Expand Up @@ -396,20 +396,20 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds)
fixed_width_column_wrapper<bool> const bools_col2({0, 1, 2, 255, 0});

constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3;
auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, {}, 42);
auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314);
auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42);
auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42);
auto const hash_decimal64 = cudf::hash(cudf::table_view({decimal64_col}), hasher, {}, 42);
auto const hash_longs = cudf::hash(cudf::table_view({longs_col}), hasher, {}, 42);
auto const hash_floats = cudf::hash(cudf::table_view({floats_col}), hasher, {}, 42);
auto const hash_dates = cudf::hash(cudf::table_view({dates_col}), hasher, {}, 42);
auto const hash_decimal32 = cudf::hash(cudf::table_view({decimal32_col}), hasher, {}, 42);
auto const hash_ints = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42);
auto const hash_shorts = cudf::hash(cudf::table_view({shorts_col}), hasher, {}, 42);
auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, {}, 42);
auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42);
auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42);
auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, 42);
auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, 314);
auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, 42);
auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, 42);
auto const hash_decimal64 = cudf::hash(cudf::table_view({decimal64_col}), hasher, 42);
auto const hash_longs = cudf::hash(cudf::table_view({longs_col}), hasher, 42);
auto const hash_floats = cudf::hash(cudf::table_view({floats_col}), hasher, 42);
auto const hash_dates = cudf::hash(cudf::table_view({dates_col}), hasher, 42);
auto const hash_decimal32 = cudf::hash(cudf::table_view({decimal32_col}), hasher, 42);
auto const hash_ints = cudf::hash(cudf::table_view({ints_col}), hasher, 42);
auto const hash_shorts = cudf::hash(cudf::table_view({shorts_col}), hasher, 42);
auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, 42);
auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, 42);
auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, 42);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, verbosity);
Expand Down Expand Up @@ -439,7 +439,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds)
shorts_col,
bytes_col,
bools_col2});
auto const hash_combined = cudf::hash(combined_table, hasher, {}, 42);
auto const hash_combined = cudf::hash(combined_table, hasher, 42);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, verbosity);
}

Expand Down
2 changes: 0 additions & 2 deletions docs/cudf/source/api_docs/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,4 @@ Serialization / IO / conversion
Series.from_categorical
Series.from_masked_array
Series.from_pandas
Series.hash_encode
Series.hash_values

3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/cpp/hash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ from cudf._lib.cpp.table.table_view cimport table_view
cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
cdef unique_ptr[column] hash "cudf::hash" (
const table_view& input,
const libcudf_types.hash_id& hash_function,
const vector[uint32_t]& initial_hash,
const libcudf_types.hash_id hash_function,
const uint32_t seed
) except +
4 changes: 1 addition & 3 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ def hash_partition(source_table, object columns_to_hash,
)


def hash(source_table, str method, object initial_hash=None, int seed=0):
cdef vector[uint32_t] c_initial_hash = initial_hash or []
def hash(source_table, str method, int seed=0):
cdef table_view c_source_view = table_view_from_table(
source_table, ignore_index=True)
cdef unique_ptr[column] c_result
Expand All @@ -71,7 +70,6 @@ def hash(source_table, str method, object initial_hash=None, int seed=0):
cpp_hash(
c_source_view,
c_hash_function,
c_initial_hash,
seed
)
)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,8 +580,8 @@ def _gather(
result._copy_type_metadata(self)
return result

def _hash(self, method, initial_hash=None):
return libcudf.hash.hash(self, method, initial_hash)
def _hash(self, method):
return libcudf.hash.hash(self, method)

def _hash_partition(
self, columns_to_hash, num_partitions, keep_index=True
Expand Down
Loading