From aa7ca46dcd9ebbeef86261b188735e58cfb4c3ca Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 24 Mar 2021 14:58:18 -0500 Subject: [PATCH] Fix SparkMurmurHash3_32 hash inconsistencies with Apache Spark (#7672) #7024 added a Spark variant of Murmur3 hashing, but it is inconsistent with Apache Spark's hash calculations in a few areas: - `-0.0` and `0.0` are not treated the same by Apache Spark for floats and doubles - byte and short integral values are upcast to a 32-bit signed int (i.e.: sign-extended) before calculating the hash In addition libcudf allows hashing of timestamp columns but the JNI bindings asserted if timestamp columns were passed in, disabling the ability to hash on timestamps directly. Authors: - Jason Lowe (@jlowe) Approvers: - Nghia Truong (@ttnghia) - Jake Hemstad (@jrhemstad) - Alessandro Bellina (@abellina) - MithunR (@mythrocks) - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7672 --- .../cudf/detail/utilities/hash_functions.cuh | 49 ++++- cpp/tests/hashing/hash_test.cpp | 174 ++++++++++++++---- .../java/ai/rapids/cudf/ColumnVector.java | 1 - .../java/ai/rapids/cudf/ColumnVectorTest.java | 40 ++++ 4 files changed, 226 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index a2e7d6d4aae..31533a69487 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -570,9 +571,7 @@ struct SparkMurmurHash3_32 { template ::value>* = nullptr> hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const { - if (key == T{0.0}) { - return compute(T{0.0}); - } else if (isnan(key)) { + if (isnan(key)) { T nan = std::numeric_limits::quiet_NaN(); return compute(nan); } else { @@ -630,6 +629,48 @@ hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(bool return this->compute(key); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(int8_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(uint8_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(int16_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(uint16_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(numeric::decimal32 const& key) const +{ + return this->compute(key.value()); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(numeric::decimal64 const& key) const +{ + return this->compute(key.value()); +} + /** * @brief Specialization of MurmurHash3_32 operator for strings. 
*/ diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index f0d13390edf..5641d445ff3 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -201,27 +202,37 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes) T nan = std::numeric_limits::quiet_NaN(); T inf = std::numeric_limits::infinity(); - fixed_width_column_wrapper const col1({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); - fixed_width_column_wrapper const col2( - {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + fixed_width_column_wrapper const col({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col_neg_zero( + {T(-0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col_neg_nan( + {T(0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + auto const table_col = cudf::table_view({col}); + auto const table_col_neg_zero = cudf::table_view({col_neg_zero}); + auto const table_col_neg_nan = cudf::table_view({col_neg_nan}); - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); + auto const hash_col = cudf::hash(table_col); + auto const hash_col_neg_zero = cudf::hash(table_col_neg_zero); + auto const hash_col_neg_nan = cudf::hash(table_col_neg_nan); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view(), true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_zero, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, true); - auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0); - auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3); + constexpr auto serial_hasher = cudf::hash_id::HASH_SERIAL_MURMUR3; + auto const serial_col = cudf::hash(table_col, serial_hasher, {}, 0); + 
auto const serial_col_neg_zero = cudf::hash(table_col_neg_zero, serial_hasher); + auto const serial_col_neg_nan = cudf::hash(table_col_neg_nan, serial_hasher); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_zero, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_nan, true); - auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0); - auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3); + // Spark hash is sensitive to 0 and -0 + constexpr auto spark_hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const spark_col = cudf::hash(table_col, spark_hasher, {}, 0); + auto const spark_col_neg_nan = cudf::hash(table_col_neg_nan, spark_hasher); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan); } class SerialMurmurHash3Test : public cudf::test::BaseFixture { @@ -267,10 +278,59 @@ class SparkMurmurHash3Test : public cudf::test::BaseFixture { TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) { - fixed_width_column_wrapper const strings_col_result( + // The hash values were determined by running the following Scala code in Apache Spark: + // import org.apache.spark.sql.catalyst.util.DateTimeUtils + // val schema = new StructType().add("strings",StringType).add("doubles",DoubleType) + // .add("timestamps",TimestampType).add("decimal64", DecimalType(18,7)).add("longs",LongType) + // .add("floats",FloatType).add("dates",DateType).add("decimal32", DecimalType(9,3)) + // .add("ints",IntegerType).add("shorts",ShortType).add("bytes",ByteType) + // .add("bools",BooleanType) + // val data = Seq( + // Row("", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat, + // DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, false), + // Row("The quick brown fox", -(0.toDouble), 
DateTimeUtils.toJavaTimestamp(100), + // BigDecimal("0.00001"), 100.toLong, -(0.toFloat), DateTimeUtils.toJavaDate(100), + // BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), + // Row("jumps over the lazy dog.", -Double.NaN, DateTimeUtils.toJavaTimestamp(-100), + // BigDecimal("-0.00001"), -100.toLong, -Float.NaN, DateTimeUtils.toJavaDate(-100), + // BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, true), + // Row("All work and no play makes Jack a dull boy", Double.MinValue, + // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), + // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), + // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) + // val df = spark.createDataFrame(sc.parallelize(data), schema) + // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) + // df.select(hash(col("*"))).collect + fixed_width_column_wrapper const hash_strings_expected( {1467149710, 723257560, -1620282500, -2001858707, 1588473657}); - fixed_width_column_wrapper const ints_col_result( + fixed_width_column_wrapper const hash_doubles_expected( + {-1670924195, -853646085, -1281358385, 1897734433, -508695674}); + fixed_width_column_wrapper const hash_timestamps_expected( + {-1670924195, 1114849490, 904948192, -1832979433, 1752430209}); + fixed_width_column_wrapper const hash_decimal64_expected( + {-1670924195, 1114849490, 904948192, 1962370902, -1795328666}); + fixed_width_column_wrapper const hash_longs_expected( + {-1670924195, 1114849490, 904948192, -853646085, -1604625029}); + fixed_width_column_wrapper const 
hash_floats_expected( + {933211791, 723455942, -349261430, -1225560532, -338752985}); + fixed_width_column_wrapper const hash_dates_expected( + {933211791, 751823303, -1080202046, -1906567553, -1503850410}); + fixed_width_column_wrapper const hash_decimal32_expected( + {-1670924195, 1114849490, 904948192, -1454351396, -193774131}); + fixed_width_column_wrapper const hash_ints_expected( {933211791, 751823303, -1080202046, 723455942, 133916647}); + fixed_width_column_wrapper const hash_shorts_expected( + {933211791, 751823303, -1080202046, -1871935946, 1249274084}); + fixed_width_column_wrapper const hash_bytes_expected( + {933211791, 751823303, -1080202046, 1110053733, 1135925485}); + fixed_width_column_wrapper const hash_bools_expected( + {933211791, -559580957, -559580957, -559580957, 933211791}); + fixed_width_column_wrapper const hash_combined_expected( + {-1947042614, -1731440908, 807283935, 725489209, 822276819}); strings_column_wrapper const strings_col({"", "The quick brown fox", @@ -278,26 +338,74 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) "All work and no play makes Jack a dull boy", "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"}); - using limits = std::numeric_limits; - fixed_width_column_wrapper const ints_col({0, 100, -100, limits::min(), limits::max()}); - + using double_limits = std::numeric_limits; + using long_limits = std::numeric_limits; + using float_limits = std::numeric_limits; + using int_limits = std::numeric_limits; + fixed_width_column_wrapper const doubles_col( + {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()}); + fixed_width_column_wrapper const timestamps_col( + {0L, 100L, -100L, long_limits::min() / 1000000, long_limits::max() / 1000000}); + fixed_point_column_wrapper const decimal64_col( + {0L, 100L, -100L, -999999999999999999L, 999999999999999999L}, numeric::scale_type{-7}); + fixed_width_column_wrapper const longs_col( + {0L, 100L, -100L, long_limits::min(), long_limits::max()}); + 
fixed_width_column_wrapper const floats_col( + {0.f, -0.f, -float_limits::quiet_NaN(), float_limits::lowest(), float_limits::max()}); + fixed_width_column_wrapper dates_col( + {0, 100, -100, int_limits::min() / 100, int_limits::max() / 100}); + fixed_point_column_wrapper const decimal32_col({0, 100, -100, -999999999, 999999999}, + numeric::scale_type{-3}); + fixed_width_column_wrapper const ints_col( + {0, 100, -100, int_limits::min(), int_limits::max()}); + fixed_width_column_wrapper const shorts_col({0, 100, -100, -32768, 32767}); + fixed_width_column_wrapper const bytes_col({0, 100, -100, -128, 127}); fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); - auto const input1 = cudf::table_view({strings_col}); - auto const input2 = cudf::table_view({ints_col}); - auto const input3 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input4 = cudf::table_view({strings_col, ints_col, bools_col2}); - - auto const hashed_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 314); - auto const hashed_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 42); - auto const hashed_output3 = cudf::hash(input3, cudf::hash_id::HASH_SPARK_MURMUR3, {}); - auto const hashed_output4 = cudf::hash(input4, cudf::hash_id::HASH_SPARK_MURMUR3, {}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output1->view(), strings_col_result, true); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output2->view(), ints_col_result, true); - EXPECT_EQ(input3.num_rows(), hashed_output3->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output3->view(), hashed_output4->view(), true); + constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); + auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42); + auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), 
hasher, {}, 42); + auto const hash_decimal64 = cudf::hash(cudf::table_view({decimal64_col}), hasher, {}, 42); + auto const hash_longs = cudf::hash(cudf::table_view({longs_col}), hasher, {}, 42); + auto const hash_floats = cudf::hash(cudf::table_view({floats_col}), hasher, {}, 42); + auto const hash_dates = cudf::hash(cudf::table_view({dates_col}), hasher, {}, 42); + auto const hash_decimal32 = cudf::hash(cudf::table_view({decimal32_col}), hasher, {}, 42); + auto const hash_ints = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42); + auto const hash_shorts = cudf::hash(cudf::table_view({shorts_col}), hasher, {}, 42); + auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, {}, 42); + auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42); + auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal64, hash_decimal64_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_longs, hash_longs_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_floats, hash_floats_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_dates, hash_dates_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal32, hash_decimal32_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_ints, hash_ints_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_shorts, hash_shorts_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bytes, hash_bytes_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, true); + + auto const combined_table = cudf::table_view({strings_col, + doubles_col, + timestamps_col, + 
decimal64_col, + longs_col, + floats_col, + dates_col, + decimal32_col, + ints_col, + shorts_col, + bytes_col, + bools_col2}); + auto const hash_combined = cudf::hash(combined_table, hasher, {}, 42); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, true); } class MD5HashTest : public cudf::test::BaseFixture { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index defb6eea5b9..e6675591164 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -606,7 +606,6 @@ public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[]) assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isTimestampType() : "Unsupported column type Timestamp"; assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; columnViews[i] = columns[i].getNativeView(); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 00d6e51fd91..02fbe56431b 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -527,6 +527,46 @@ void testSpark32BitMurmur3HashDoubles() { } } + @Test + void testSpark32BitMurmur3HashTimestamps() { + try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs( + 0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 42, 1114849490, 904948192, 657182333, 42, -57193045)) { + assertColumnsAreEqual(expected, result); + } + } + 
+ @Test + void testSpark32BitMurmur3HashDecimal64() { + try (ColumnVector v = ColumnVector.decimalFromLongs(-7, + 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 1114849490, 904948192, 657182333, -57193045)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testSpark32BitMurmur3HashDecimal32() { + try (ColumnVector v = ColumnVector.decimalFromInts(-3, + 0, 100, -100, 0x12345678, -0x12345678); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 1114849490, 904948192, -958054811, -1447702630)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testSpark32BitMurmur3HashDates() { + try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts( + 0, null, 100, -100, 0x12345678, null, -0x12345678); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(933211791, 42, 751823303, -1080202046, -1721170160, 42, 1852996993)) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testSpark32BitMurmur3HashFloats() { try (ColumnVector v = ColumnVector.fromBoxedFloats(