Skip to content

Commit

Permalink
Remove HASH_SERIAL_MURMUR3 / serial32BitMurmurHash3 (#11383)
Browse files Browse the repository at this point in the history
This PR closes #11296. While implementing Spark list hashing in #11292, I noticed that `HASH_SERIAL_MURMUR3` does not appear to be used except in tests. It is not exposed in Python. While it is exposed in the JNI bindings, it is not used by spark-rapids. I discussed this with @rwlee, and it seems that this feature was added only to mirror the design of the Spark serial hash implementation in #6781, which has since been superseded by #11292. We do not need to keep this vestigial feature.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - https://github.com/brandon-b-miller
  - David Wendt (https://github.com/davidwendt)
  - Jason Lowe (https://github.com/jlowe)

URL: #11383
  • Loading branch information
bdice authored Aug 1, 2022
1 parent a36c363 commit 1d4aa4a
Show file tree
Hide file tree
Showing 9 changed files with 6 additions and 267 deletions.
2 changes: 0 additions & 2 deletions cpp/benchmarks/hashing/hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,9 @@ static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls h
#define HASH_BENCHMARK_DEFINE(hid, n) H_BENCHMARK_DEFINE(concat(hid, _, n), hid, n)

HASH_BENCHMARK_DEFINE(HASH_MURMUR3, nulls)
HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, nulls)
HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, nulls)
HASH_BENCHMARK_DEFINE(HASH_MD5, nulls)

HASH_BENCHMARK_DEFINE(HASH_MURMUR3, no_nulls)
HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, no_nulls)
HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, no_nulls)
HASH_BENCHMARK_DEFINE(HASH_MD5, no_nulls)
7 changes: 0 additions & 7 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,6 @@ std::unique_ptr<column> spark_murmur_hash3_32(
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

template <template <typename> class hash_function>
std::unique_ptr<column> serial_murmur_hash3_32(
table_view const& input,
uint32_t seed = cudf::DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<column> md5_hash(
table_view const& input,
rmm::cuda_stream_view stream = cudf::default_stream_value,
Expand Down
9 changes: 4 additions & 5 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,10 @@ using hash_value_type = uint32_t; ///< Type of hash value
* @brief Identifies the hash function to be used
*/
enum class hash_id {
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function
HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function
HASH_MD5 ///< MD5 hash function
HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
HASH_MURMUR3, ///< Murmur3 hash function
HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function
HASH_MD5 ///< MD5 hash function
};

/**
Expand Down
39 changes: 0 additions & 39 deletions cpp/src/hash/hashing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,43 +52,6 @@ std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)

} // namespace

template <template <typename> class hash_function>
std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto output = make_numeric_column(
data_type(type_id::INT32), input.num_rows(), mask_state::UNALLOCATED, stream, mr);

if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

table_view const leaf_table(to_leaf_columns(input.begin(), input.end()));
auto const device_input = table_device_view::create(leaf_table, stream);
auto output_view = output->mutable_view();

thrust::tabulate(
rmm::exec_policy(stream),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
[device_input = *device_input, nulls = has_nulls(leaf_table), seed] __device__(auto row_index) {
return detail::accumulate(
device_input.begin(),
device_input.end(),
seed,
[row_index, nulls] __device__(auto hash, auto column) {
return cudf::type_dispatcher(
column.type(),
experimental::row::hash::element_hasher<hash_function, nullate::DYNAMIC>{
nullate::DYNAMIC{nulls}, hash, hash},
column,
row_index);
});
});

return output;
}

std::unique_ptr<column> hash(table_view const& input,
hash_id hash_function,
uint32_t seed,
Expand All @@ -97,8 +60,6 @@ std::unique_ptr<column> hash(table_view const& input,
{
switch (hash_function) {
case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, seed, stream, mr);
case (hash_id::HASH_SERIAL_MURMUR3):
return serial_murmur_hash3_32<MurmurHash3_32>(input, seed, stream, mr);
case (hash_id::HASH_SPARK_MURMUR3): return spark_murmur_hash3_32(input, seed, stream, mr);
case (hash_id::HASH_MD5): return md5_hash(input, stream, mr);
default: CUDF_FAIL("Unsupported hash function.");
Expand Down
79 changes: 0 additions & 79 deletions cpp/tests/hashing/hash_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,6 @@ TEST_F(HashTest, MultiValueNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

Expand Down Expand Up @@ -371,12 +365,6 @@ TYPED_TEST(HashTestTyped, Equality)
EXPECT_EQ(input.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input, cudf::hash_id::HASH_SPARK_MURMUR3);

Expand All @@ -401,12 +389,6 @@ TYPED_TEST(HashTestTyped, EqualityNulls)
EXPECT_EQ(input1.num_rows(), output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());

auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, 0);
auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3);

EXPECT_EQ(input1.num_rows(), serial_output1->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view());

auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, 0);
auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3);

Expand Down Expand Up @@ -445,14 +427,6 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_zero, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, verbosity);

constexpr auto serial_hasher = cudf::hash_id::HASH_SERIAL_MURMUR3;
auto const serial_col = cudf::hash(table_col, serial_hasher, 0);
auto const serial_col_neg_zero = cudf::hash(table_col_neg_zero, serial_hasher);
auto const serial_col_neg_nan = cudf::hash(table_col_neg_nan, serial_hasher);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_zero, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_nan, verbosity);

// Spark hash is sensitive to 0 and -0
constexpr auto spark_hasher = cudf::hash_id::HASH_SPARK_MURMUR3;
auto const spark_col = cudf::hash(table_col, spark_hasher, 0);
Expand All @@ -461,59 +435,6 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan);
}

class SerialMurmurHash3Test : public cudf::test::BaseFixture {
};

TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds)
{
fixed_width_column_wrapper<int32_t> const strings_col_result(
{1467149710, -680899318, -1620282500, 91106683, -1564993834});
fixed_width_column_wrapper<int32_t> const ints_col_result(
{933211791, 751823303, -1080202046, 723455942, 133916647});

strings_column_wrapper const strings_col({"",
"The quick brown fox",
"jumps over the lazy dog.",
"All work and no play makes Jack a dull boy",
"!\"#$%&\'()*+,-./]:;<=>?@[\\]^_`{|}~\ud720\ud721"});

using limits = std::numeric_limits<int32_t>;
fixed_width_column_wrapper<int32_t> const ints_col({0, 100, -100, limits::min(), limits::max()});

fixed_width_column_wrapper<bool> const bools_col1({0, 1, 1, 1, 0});
fixed_width_column_wrapper<bool> const bools_col2({0, 1, 2, 255, 0});

std::vector<std::unique_ptr<cudf::column>> struct_field_cols;
struct_field_cols.emplace_back(std::make_unique<cudf::column>(strings_col));
struct_field_cols.emplace_back(std::make_unique<cudf::column>(ints_col));
struct_field_cols.emplace_back(std::make_unique<cudf::column>(bools_col1));
structs_column_wrapper structs_col(std::move(struct_field_cols));

auto const combo1 = cudf::table_view({strings_col, ints_col, bools_col1});
auto const combo2 = cudf::table_view({strings_col, ints_col, bools_col2});

constexpr auto hasher = cudf::hash_id::HASH_SERIAL_MURMUR3;
auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, 314);
auto const ints_hash = cudf::hash(cudf::table_view({ints_col}), hasher, 42);
auto const combo1_hash = cudf::hash(combo1, hasher, {});
auto const combo2_hash = cudf::hash(combo2, hasher, {});
auto const structs_hash = cudf::hash(cudf::table_view({structs_col}), hasher, {});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*strings_hash, strings_col_result, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ints_hash, ints_col_result, verbosity);
EXPECT_EQ(combo1.num_rows(), combo1_hash->size());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*combo1_hash, *combo2_hash, verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*structs_hash, *combo1_hash, verbosity);
}

TEST_F(SerialMurmurHash3Test, ListThrows)
{
lists_column_wrapper<cudf::string_view> strings_list_col({{""}, {"abc"}, {"123"}});
EXPECT_THROW(
cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SERIAL_MURMUR3, {}),
cudf::logic_error);
}

class SparkMurmurHash3Test : public cudf::test::BaseFixture {
};

Expand Down
34 changes: 0 additions & 34 deletions java/src/main/java/ai/rapids/cudf/ColumnVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -725,40 +725,6 @@ public static ColumnVector md5Hash(ColumnView... columns) {
return new ColumnVector(hash(columnViews, HashType.HASH_MD5.getNativeId(), 0));
}

/**
* Create a new vector containing the murmur3 hash of each row in the table.
*
* @param seed integer seed for the murmur3 hash function
* @param columns array of columns to hash, must have identical number of rows.
* @return the new ColumnVector of 32-bit values representing each row's hash value.
*/
public static ColumnVector serial32BitMurmurHash3(int seed, ColumnView columns[]) {
if (columns.length < 1) {
throw new IllegalArgumentException("Murmur3 hashing requires at least 1 column of input");
}
long[] columnViews = new long[columns.length];
long size = columns[0].getRowCount();

for(int i = 0; i < columns.length; i++) {
assert columns[i] != null : "Column vectors passed may not be null";
assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size";
assert !columns[i].getType().isDurationType() : "Unsupported column type Duration";
assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported";
columnViews[i] = columns[i].getNativeView();
}
return new ColumnVector(hash(columnViews, HashType.HASH_SERIAL_MURMUR3.getNativeId(), seed));
}

/**
* Create a new vector containing the murmur3 hash of each row in the table, seed defaulted to 0.
*
* @param columns array of columns to hash, must have identical number of rows.
* @return the new ColumnVector of 32-bit values representing each row's hash value.
*/
public static ColumnVector serial32BitMurmurHash3(ColumnView columns[]) {
return serial32BitMurmurHash3(0, columns);
}

/**
* Create a new vector containing spark's 32-bit murmur3 hash of each row in the table.
* Spark's murmur3 hash uses a different tail processing algorithm.
Expand Down
5 changes: 2 additions & 3 deletions java/src/main/java/ai/rapids/cudf/HashType.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@
public enum HashType {
IDENTITY(0),
MURMUR3(1),
HASH_SERIAL_MURMUR3(2),
HASH_SPARK_MURMUR3(3),
HASH_MD5(4);
HASH_SPARK_MURMUR3(2),
HASH_MD5(3);

private static final HashType[] HASH_TYPES = HashType.values();
final int nativeId;
Expand Down
97 changes: 0 additions & 97 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -519,103 +519,6 @@ void testMD5HashLists() {
assertColumnsAreEqual(expected, result);
}
}
@Test
void testSerial32BitMurmur3HashStrings() {
try (ColumnVector v0 = ColumnVector.fromStrings(
"a", "B\nc", "dE\"\u0100\t\u0101 \ud720\ud721\\Fg2\'",
"A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
"in the MD5 hash function. This string needed to be longer.A 60 character string to " +
"test MD5's message padding algorithm",
"hiJ\ud720\ud721\ud720\ud721", null);
ColumnVector result = ColumnVector.serial32BitMurmurHash3(42, new ColumnVector[]{v0});
ColumnVector expected = ColumnVector.fromBoxedInts(-1293573533, 1163854319, 1423943036, 1504480835, 1249086584, 42)) {
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSerial32BitMurmur3HashInts() {
try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);
ColumnVector v1 = ColumnVector.fromBoxedInts(0, null, -100, null, null, Integer.MAX_VALUE);
ColumnVector result = ColumnVector.serial32BitMurmurHash3(42, new ColumnVector[]{v0, v1});
ColumnVector expected = ColumnVector.fromBoxedInts(59727262, 751823303, -1080202046, 42, 723455942, 133916647)) {
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSerial32BitMurmur3HashDoubles() {
try (ColumnVector v = ColumnVector.fromBoxedDoubles(
0.0, null, 100.0, -100.0, Double.MIN_NORMAL, Double.MAX_VALUE,
POSITIVE_DOUBLE_NAN_UPPER_RANGE, POSITIVE_DOUBLE_NAN_LOWER_RANGE,
NEGATIVE_DOUBLE_NAN_UPPER_RANGE, NEGATIVE_DOUBLE_NAN_LOWER_RANGE,
Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY);
ColumnVector result = ColumnVector.serial32BitMurmurHash3(new ColumnVector[]{v});
ColumnVector expected = ColumnVector.fromBoxedInts(1669671676, 0, -544903190, -1831674681, 150502665, 474144502, 1428788237, 1428788237, 1428788237, 1428788237, 420913893, 1915664072)) {
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSerialBitMurmur3HashFloats() {
try (ColumnVector v = ColumnVector.fromBoxedFloats(
0f, 100f, -100f, Float.MIN_NORMAL, Float.MAX_VALUE, null,
POSITIVE_FLOAT_NAN_LOWER_RANGE, POSITIVE_FLOAT_NAN_UPPER_RANGE,
NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE,
Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY);
ColumnVector result = ColumnVector.serial32BitMurmurHash3(411, new ColumnVector[]{v});
ColumnVector expected = ColumnVector.fromBoxedInts(-235179434, 1812056886, 2028471189, 1775092689, -1531511762, 411, -1053523253, -1053523253, -1053523253, -1053523253, -1526256646, 930080402)){
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSerial32BitMurmur3HashBools() {
try (ColumnVector v0 = ColumnVector.fromBoxedBooleans(null, true, false, true, null, false);
ColumnVector v1 = ColumnVector.fromBoxedBooleans(null, true, false, null, false, true);
ColumnVector result = ColumnVector.serial32BitMurmurHash3(0, new ColumnVector[]{v0, v1});
ColumnVector expected = ColumnVector.fromBoxedInts(0, 884701402, 1032769583, -463810133, 1364076727, -991270669)) {
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSerial32BitMurmur3HashMixed() {
try (ColumnVector strings = ColumnVector.fromStrings(
"a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
"A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
"in the MD5 hash function. This string needed to be longer.",
null, null);
ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
ColumnVector doubles = ColumnVector.fromBoxedDoubles(
0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
ColumnVector floats = ColumnVector.fromBoxedFloats(
0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
ColumnVector result = ColumnVector.serial32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools});
ColumnVector expected = ColumnVector.fromBoxedInts(387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) {
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSerial32BitMurmur3HashStruct() {
try (ColumnVector strings = ColumnVector.fromStrings(
"a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
"A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
"in the MD5 hash function. This string needed to be longer.",
null, null);
ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
ColumnVector doubles = ColumnVector.fromBoxedDoubles(
0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
ColumnVector floats = ColumnVector.fromBoxedFloats(
0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
ColumnVector result = ColumnVector.serial32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools});
ColumnVector expected = ColumnVector.fromBoxedInts(387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) {
assertColumnsAreEqual(expected, result);
}
}

@Test
void testSpark32BitMurmur3HashStrings() {
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/cpp/hash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
ctypedef enum hash_id "cudf::hash_id":
HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY"
HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3"
HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3"
HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3"
HASH_MD5 "cudf::hash_id::HASH_MD5"

Expand Down

0 comments on commit 1d4aa4a

Please sign in to comment.