From d01cc1ca6c0b4966322e2ca822d0b04b42f55f13 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Mon, 22 Mar 2021 15:43:34 -0500 Subject: [PATCH] Fix SparkMurmurHash3_32 hash inconsistencies with Apache Spark --- .../cudf/detail/utilities/hash_functions.cuh | 34 +++- cpp/tests/hashing/hash_test.cpp | 157 ++++++++++++++---- .../java/ai/rapids/cudf/ColumnVector.java | 1 - .../java/ai/rapids/cudf/ColumnVectorTest.java | 20 +++ 4 files changed, 174 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index e9d66d125dd..a6f24240c0c 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -570,9 +570,7 @@ struct SparkMurmurHash3_32 { template ::value>* = nullptr> hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const { - if (key == T{0.0}) { - return compute(T{0.0}); - } else if (isnan(key)) { + if (isnan(key)) { T nan = std::numeric_limits::quiet_NaN(); return compute(nan); } else { @@ -630,6 +628,34 @@ hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(bool return this->compute(key); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(int8_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(uint8_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(int16_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(uint16_t const& key) const +{ + return this->compute(key); +} + /** * @brief Specialization of MurmurHash3_32 operator for strings. */ diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index f0d13390edf..36049436732 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -201,27 +201,37 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes) T nan = std::numeric_limits::quiet_NaN(); T inf = std::numeric_limits::infinity(); - fixed_width_column_wrapper const col1({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); - fixed_width_column_wrapper const col2( - {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + fixed_width_column_wrapper const col({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col_neg_zero( + {T(-0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col_neg_nan( + {T(0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + auto const table_col = cudf::table_view({col}); + auto const table_col_neg_zero = cudf::table_view({col_neg_zero}); + auto const table_col_neg_nan = cudf::table_view({col_neg_nan}); - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); + auto const hash_col = cudf::hash(table_col); + auto const hash_col_neg_zero = cudf::hash(table_col_neg_zero); + auto const hash_col_neg_nan = cudf::hash(table_col_neg_nan); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view(), true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_zero, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, true); - auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0); - auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3); + constexpr auto serial_hasher = cudf::hash_id::HASH_SERIAL_MURMUR3; + auto const serial_col = cudf::hash(table_col, serial_hasher, {}, 0); + auto const serial_col_neg_zero = cudf::hash(table_col_neg_zero, serial_hasher); + auto const serial_col_neg_nan = cudf::hash(table_col_neg_nan, serial_hasher); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_zero, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_nan, true); - auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0); - auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3); + // Spark hash is sensitive to 0 and -0 + constexpr auto spark_hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const spark_col = cudf::hash(table_col, spark_hasher, {}, 0); + auto const spark_col_neg_nan = cudf::hash(table_col_neg_nan, spark_hasher); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan); } class SerialMurmurHash3Test : public cudf::test::BaseFixture { @@ -267,10 +277,53 @@ class SparkMurmurHash3Test : public cudf::test::BaseFixture { TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) { - fixed_width_column_wrapper const strings_col_result( + // The hash values were determined by running the following Scala code in Apache Spark: + // import org.apache.spark.sql.catalyst.util.DateTimeUtils + // val schema = new StructType().add("strings",StringType).add("doubles",DoubleType) + // .add("timestamps",TimestampType).add("longs",LongType).add("floats",FloatType) + // .add("dates",DateType).add("ints",IntegerType).add("shorts",ShortType) + // .add("bytes",ByteType).add("bools",BooleanType) + // val data = Seq( + // Row("", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), 0.toLong, 0.toFloat, + // DateTimeUtils.toJavaDate(0), 0, 0.toShort, 0.toByte, false), + // Row("The quick brown fox", -(0.toDouble), DateTimeUtils.toJavaTimestamp(100), 100.toLong, + // -(0.toFloat), DateTimeUtils.toJavaDate(100), 100, 100.toShort, 100.toByte, true), + // Row("jumps over the lazy dog.", -Double.NaN, DateTimeUtils.toJavaTimestamp(-100), + // -100.toLong, -Float.NaN, DateTimeUtils.toJavaDate(-100), -100, -100.toShort, + // -100.toByte, true), + // Row("All work and no play makes Jack a dull boy", Double.MinValue, + // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), Long.MinValue, Float.MinValue, + // DateTimeUtils.toJavaDate(Int.MinValue/100), Int.MinValue, Short.MinValue, Byte.MinValue, + // true), + // Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), Long.MaxValue, Float.MaxValue, + // DateTimeUtils.toJavaDate(Int.MaxValue/100), Int.MaxValue, Short.MaxValue, Byte.MaxValue, + // false)) + // val df = spark.createDataFrame(sc.parallelize(data), schema) + // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) + // df.select(hash(col("*"))).collect + fixed_width_column_wrapper const hash_strings_expected( {1467149710, 723257560, -1620282500, -2001858707, 1588473657}); - fixed_width_column_wrapper const ints_col_result( + fixed_width_column_wrapper const hash_doubles_expected( + {-1670924195, -853646085, -1281358385, 1897734433, -508695674}); + fixed_width_column_wrapper const hash_timestamps_expected( + {-1670924195, 1114849490, 904948192, -1832979433, 1752430209}); + fixed_width_column_wrapper const hash_longs_expected( + {-1670924195, 1114849490, 904948192, -853646085, -1604625029}); + fixed_width_column_wrapper const hash_floats_expected( + {933211791, 723455942, -349261430, -1225560532, -338752985}); + fixed_width_column_wrapper const hash_dates_expected( + {933211791, 751823303, -1080202046, -1906567553, -1503850410}); + fixed_width_column_wrapper const hash_ints_expected( {933211791, 751823303, -1080202046, 723455942, 133916647}); + fixed_width_column_wrapper const hash_shorts_expected( + {933211791, 751823303, -1080202046, -1871935946, 1249274084}); + fixed_width_column_wrapper const hash_bytes_expected( + {933211791, 751823303, -1080202046, 1110053733, 1135925485}); + fixed_width_column_wrapper const hash_bools_expected( + {933211791, -559580957, -559580957, -559580957, 933211791}); + fixed_width_column_wrapper const hash_combined_expected( + {969935434, 595083937, 720326214, 971129823, -1060339603}); strings_column_wrapper const strings_col({"", "The quick brown fox", @@ -278,26 +331,64 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) "All work and no play makes Jack a dull boy", "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"}); - using limits = std::numeric_limits; - fixed_width_column_wrapper const ints_col({0, 100, -100, limits::min(), limits::max()}); - + using double_limits = std::numeric_limits; + using long_limits = std::numeric_limits; + using float_limits = std::numeric_limits; + using int_limits = std::numeric_limits; + fixed_width_column_wrapper const doubles_col( + {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()}); + fixed_width_column_wrapper const timestamps_col( + {0L, 100L, -100L, long_limits::min() / 1000000, long_limits::max() / 1000000}); + fixed_width_column_wrapper const longs_col( + {0L, 100L, -100L, long_limits::min(), long_limits::max()}); + fixed_width_column_wrapper const floats_col( + {0.f, -0.f, -float_limits::quiet_NaN(), float_limits::lowest(), float_limits::max()}); + fixed_width_column_wrapper dates_col( + {0, 100, -100, int_limits::min() / 100, int_limits::max() / 100}); + fixed_width_column_wrapper const ints_col( + {0, 100, -100, int_limits::min(), int_limits::max()}); + fixed_width_column_wrapper const shorts_col({0, 100, -100, -32768, 32767}); + fixed_width_column_wrapper const bytes_col({0, 100, -100, -128, 127}); fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); - auto const input1 = cudf::table_view({strings_col}); - auto const input2 = cudf::table_view({ints_col}); - auto const input3 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input4 = cudf::table_view({strings_col, ints_col, bools_col2}); - - auto const hashed_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 314); - auto const hashed_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 42); - auto const hashed_output3 = cudf::hash(input3, cudf::hash_id::HASH_SPARK_MURMUR3, {}); - auto const hashed_output4 = cudf::hash(input4, cudf::hash_id::HASH_SPARK_MURMUR3, {}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output1->view(), strings_col_result, true); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output2->view(), ints_col_result, true); - EXPECT_EQ(input3.num_rows(), hashed_output3->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output3->view(), hashed_output4->view(), true); + constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); + auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42); + auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42); + auto const hash_longs = cudf::hash(cudf::table_view({longs_col}), hasher, {}, 42); + auto const hash_floats = cudf::hash(cudf::table_view({floats_col}), hasher, {}, 42); + auto const hash_dates = cudf::hash(cudf::table_view({dates_col}), hasher, {}, 42); + auto const hash_ints = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42); + auto const hash_shorts = cudf::hash(cudf::table_view({shorts_col}), hasher, {}, 42); + auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, {}, 42); + auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42); + auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_longs, hash_longs_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_floats, hash_floats_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_dates, hash_dates_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_ints, hash_ints_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_shorts, hash_shorts_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bytes, hash_bytes_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, true); + + auto const combined_table = cudf::table_view({strings_col, + doubles_col, + timestamps_col, + longs_col, + floats_col, + dates_col, + ints_col, + shorts_col, + bytes_col, + bools_col2}); + auto const hash_combined = cudf::hash(combined_table, hasher, {}, 42); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, true); } class MD5HashTest : public cudf::test::BaseFixture { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 9f414661967..3bc9adb5f49 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -606,7 +606,6 @@ public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[]) assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isTimestampType() : "Unsupported column type Timestamp"; assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; columnViews[i] = columns[i].getNativeView(); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 420e176efe2..673b8a67467 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -527,6 +527,26 @@ void testSpark32BitMurmur3HashDoubles() { } } + @Test + void testSpark32BitMurmur3HashTimestamps() { + try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs( + 0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 42, 1114849490, 904948192, 657182333, 42, -57193045)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testSpark32BitMurmur3HashDates() { + try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts( + 0, null, 100, -100, 0x12345678, null, -0x12345678); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(933211791, 42, 751823303, -1080202046, -1721170160, 42, 1852996993)) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testSpark32BitMurmur3HashFloats() { try (ColumnVector v = ColumnVector.fromBoxedFloats(