From d4715249886522904b38236e996f369844a99e9f Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Wed, 24 Mar 2021 09:09:30 -0500
Subject: [PATCH 1/3] Struct hashing support for serial Murmur3 and SparkMurmur3

---
 .../cudf/detail/utilities/hash_functions.cuh  |  32 +++++
 cpp/src/hash/hashing.cu                       |  23 +++-
 cpp/tests/hashing/hash_test.cpp               | 125 ++++++++++++------
 .../java/ai/rapids/cudf/ColumnVector.java     |   5 +-
 .../java/ai/rapids/cudf/ColumnVectorTest.java |  48 ++++++-
 5 files changed, 184 insertions(+), 49 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
index 31533a69487..e79107e32cf 100644
--- a/cpp/include/cudf/detail/utilities/hash_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -542,6 +542,22 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32<double>::operator()(double c
   return this->compute_floating_point(key);
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 template <typename Key>
 struct SparkMurmurHash3_32 {
   using argument_type = Key;
@@ -671,6 +687,22 @@ SparkMurmurHash3_32<numeric::decimal64>::operator()(numeric::decimal64 const& ke
   return this->compute(key.value());
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 /**
  * @brief Specialization of MurmurHash3_32 operator for strings.
  */
diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu
index 16efb666b3e..ba77df198bf 100644
--- a/cpp/src/hash/hashing.cu
+++ b/cpp/src/hash/hashing.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,6 +38,22 @@ bool md5_type_check(data_type dt)
   return !is_chrono(dt) && (is_fixed_width(dt) || (dt.id() == type_id::STRING));
 }

+template <typename IterType>
+std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)
+{
+  std::vector<column_view> leaf_columns;
+  for_each(iter_begin, iter_end, [&](column_view const& col) {
+    if (is_nested(col.type())) {
+      CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "unsupported nested type");
+      auto child_columns = to_leaf_columns(col.child_begin(), col.child_end());
+      leaf_columns.insert(leaf_columns.end(), child_columns.begin(), child_columns.end());
+    } else {
+      leaf_columns.emplace_back(col);
+    }
+  });
+  return leaf_columns;
+}
+
 }  // namespace

 namespace detail {
@@ -133,10 +149,11 @@ std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,

   if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

-  auto const device_input = table_device_view::create(input, stream);
+  table_view const leaf_table(to_leaf_columns(input.begin(), input.end()));
+  auto const device_input = table_device_view::create(leaf_table, stream);
   auto output_view = output->mutable_view();

-  if (has_nulls(input)) {
+  if (has_nulls(leaf_table)) {
     thrust::tabulate(rmm::exec_policy(stream),
                      output_view.begin<int32_t>(),
                      output_view.end<int32_t>(),
diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp
index 5641d445ff3..d928a17b3d1 100644
--- a/cpp/tests/hashing/hash_test.cpp
+++ b/cpp/tests/hashing/hash_test.cpp
@@ -257,20 +257,35 @@ TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds)
   fixed_width_column_wrapper<bool> const bools_col1({0, 1, 1, 1, 0});
   fixed_width_column_wrapper<uint8_t> const bools_col2({0, 1, 2, 255, 0});

-  auto const input1 = cudf::table_view({strings_col});
-  auto const input2 = cudf::table_view({ints_col});
-  auto const input3 = cudf::table_view({strings_col, ints_col, bools_col1});
-  auto const input4 = cudf::table_view({strings_col, ints_col, bools_col2});
-
-  auto const hashed_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 314);
-  auto const hashed_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 42);
-  auto const hashed_output3 = cudf::hash(input3, cudf::hash_id::HASH_SERIAL_MURMUR3, {});
-  auto const hashed_output4 = cudf::hash(input4, cudf::hash_id::HASH_SERIAL_MURMUR3, {});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output1->view(), strings_col_result, true);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output2->view(), ints_col_result, true);
-  EXPECT_EQ(input3.num_rows(), hashed_output3->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output3->view(), hashed_output4->view(), true);
+  std::vector<std::unique_ptr<cudf::column>> struct_field_cols;
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(strings_col));
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(ints_col));
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(bools_col1));
+  structs_column_wrapper structs_col(std::move(struct_field_cols));
+
+  auto const combo1 = cudf::table_view({strings_col, ints_col, bools_col1});
+  auto const combo2 = cudf::table_view({strings_col, ints_col, bools_col2});
+
+  constexpr auto hasher   = cudf::hash_id::HASH_SERIAL_MURMUR3;
+  auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314);
+  auto const ints_hash    = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42);
+  auto const combo1_hash  = cudf::hash(combo1, hasher, {});
+  auto const combo2_hash  = cudf::hash(combo2, hasher, {});
+  auto const structs_hash = cudf::hash(cudf::table_view({structs_col}), hasher, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*strings_hash, strings_col_result, true);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ints_hash, ints_col_result, true);
+  EXPECT_EQ(combo1.num_rows(), combo1_hash->size());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*combo1_hash, *combo2_hash, true);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*structs_hash, *combo1_hash, true);
+}
+
+TEST_F(SerialMurmurHash3Test, ListThrows)
+{
+  lists_column_wrapper<cudf::string_view> strings_list_col({{""}, {"abc"}, {"123"}});
+  EXPECT_THROW(
+    cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SERIAL_MURMUR3, {}),
+    cudf::logic_error);
 }

 class SparkMurmurHash3Test : public cudf::test::BaseFixture {
 };
@@ -280,31 +295,38 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds)
 {
   // The hash values were determined by running the following Scala code in Apache Spark:
   // import org.apache.spark.sql.catalyst.util.DateTimeUtils
-  // val schema = new StructType().add("strings",StringType).add("doubles",DoubleType)
-  //   .add("timestamps",TimestampType).add("decimal64", DecimalType(18,7)).add("longs",LongType)
-  //   .add("floats",FloatType).add("dates",DateType).add("decimal32", DecimalType(9,3))
-  //   .add("ints",IntegerType).add("shorts",ShortType).add("bytes",ByteType)
-  //   .add("bools",BooleanType)
+  // val schema = new StructType().add("structs", new StructType().add("a",IntegerType)
+  //   .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType)))
+  //   .add("strings",StringType).add("doubles",DoubleType).add("timestamps",TimestampType)
+  //   .add("decimal64", DecimalType(18,7)).add("longs",LongType).add("floats",FloatType)
+  //   .add("dates",DateType).add("decimal32", DecimalType(9,3)).add("ints",IntegerType)
+  //   .add("shorts",ShortType).add("bytes",ByteType).add("bools",BooleanType)
   // val data = Seq(
-  //   Row("", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat,
-  //     DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, false),
-  //   Row("The quick brown fox", -(0.toDouble), DateTimeUtils.toJavaTimestamp(100),
-  //     BigDecimal("0.00001"), 100.toLong, -(0.toFloat), DateTimeUtils.toJavaDate(100),
-  //     BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true),
-  //   Row("jumps over the lazy dog.", -Double.NaN, DateTimeUtils.toJavaTimestamp(-100),
-  //     BigDecimal("-0.00001"), -100.toLong, -Float.NaN, DateTimeUtils.toJavaDate(-100),
-  //     BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, true),
-  //   Row("All work and no play makes Jack a dull boy", Double.MinValue,
-  //     DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"),
-  //     Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100),
-  //     BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true),
-  //   Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue,
-  //     DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"),
-  //     Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100),
-  //     BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false))
+  //   Row(Row(0, "a", Row(0f, 0L)), "", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0),
+  //     0.toLong, 0.toFloat, DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte,
+  //     false),
+  //   Row(Row(100, "bc", Row(100f, 100L)), "The quick brown fox", -(0.toDouble),
+  //     DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat),
+  //     DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true),
+  //   Row(Row(-100, "def", Row(-100f, -100L)), "jumps over the lazy dog.", -Double.NaN,
lazy dog.", -Double.NaN, + // DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN, + // DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, + // true), + // Row(Row(0x12345678, "ghij", Row(Float.PositiveInfinity, 0x123456789abcdefL)), + // "All work and no play makes Jack a dull boy", Double.MinValue, + // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), + // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // Row(Row(-0x76543210, "klmno", Row(Float.NegativeInfinity, -0x123456789abcdefL)), + // "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), + // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) // val df = spark.createDataFrame(sc.parallelize(data), schema) // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) // df.select(hash(col("*"))).collect + fixed_width_column_wrapper const hash_structs_expected( + {-105406170, 90479889, -678041645, 1667387937, 301478567}); fixed_width_column_wrapper const hash_strings_expected( {1467149710, 723257560, -1620282500, -2001858707, 1588473657}); fixed_width_column_wrapper const hash_doubles_expected( @@ -330,18 +352,26 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const hash_bools_expected( {933211791, -559580957, -559580957, -559580957, 933211791}); fixed_width_column_wrapper const hash_combined_expected( - {-1947042614, -1731440908, 807283935, 725489209, 822276819}); + {-1172364561, -442972638, 1213234395, 796626751, 214075225}); + + using double_limits = std::numeric_limits; + using long_limits = std::numeric_limits; + using float_limits = std::numeric_limits; + using int_limits = std::numeric_limits; + fixed_width_column_wrapper a_col{0, 100, -100, 0x12345678, -0x76543210}; + strings_column_wrapper b_col{"a", "bc", "def", "ghij", "klmno"}; + fixed_width_column_wrapper x_col{ + 0.f, 100.f, -100.f, float_limits::infinity(), -float_limits::infinity()}; + fixed_width_column_wrapper y_col{ + 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL}; + structs_column_wrapper c_col{{x_col, y_col}}; + structs_column_wrapper const structs_col{{a_col, b_col, c_col}}; strings_column_wrapper const strings_col({"", "The quick brown fox", "jumps over the lazy dog.", "All work and no play makes Jack a dull boy", "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"}); - - using double_limits = std::numeric_limits; - using long_limits = std::numeric_limits; - using float_limits = std::numeric_limits; - using int_limits = std::numeric_limits; fixed_width_column_wrapper const doubles_col( {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()}); fixed_width_column_wrapper const timestamps_col( @@ -364,6 +394,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, {}, 42); auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, 
   auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42);
@@ -378,6 +409,7 @@
   auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42);
   auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42);

+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, true);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, true);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, true);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, true);
@@ -392,7 +424,8 @@
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, true);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, true);

-  auto const combined_table = cudf::table_view({strings_col,
+  auto const combined_table = cudf::table_view({structs_col,
+                                                strings_col,
                                                 doubles_col,
                                                 timestamps_col,
                                                 decimal64_col,
@@ -408,6 +441,14 @@
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, true);
 }

+TEST_F(SparkMurmurHash3Test, ListThrows)
+{
+  lists_column_wrapper<cudf::string_view> strings_list_col({{""}, {"abc"}, {"123"}});
+  EXPECT_THROW(
+    cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SPARK_MURMUR3, {}),
+    cudf::logic_error);
+}
+
 class MD5HashTest : public cudf::test::BaseFixture {
 };

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
index e6675591164..fcdb5d44ad3 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -570,8 +570,7 @@ public static ColumnVector serial32BitMurmurHash3(int seed, ColumnView columns[]
       assert columns[i] != null : "Column vectors passed may not be null";
       assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size";
       assert !columns[i].getType().isDurationType() : "Unsupported column type Duration";
-      assert !columns[i].getType().isTimestampType() : "Unsupported column type Timestamp";
-      assert !columns[i].getType().isNestedType() : "Unsupported column of nested type";
+      assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported";
       columnViews[i] = columns[i].getNativeView();
     }
     return new ColumnVector(hash(columnViews, HashType.HASH_SERIAL_MURMUR3.getNativeId(), new int[0], seed));
@@ -606,7 +605,7 @@ public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[])
       assert columns[i] != null : "Column vectors passed may not be null";
       assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size";
       assert !columns[i].getType().isDurationType() : "Unsupported column type Duration";
-      assert !columns[i].getType().isNestedType() : "Unsupported column of nested type";
+      assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported";
       columnViews[i] = columns[i].getNativeView();
     }
     return new ColumnVector(hash(columnViews, HashType.HASH_SPARK_MURMUR3.getNativeId(), new int[0], seed));
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 02fbe56431b..073a24ca738 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -490,6 +490,25 @@ void testSerial32BitMurmur3HashMixed() {
     }
   }

+  @Test
+  void testSerial32BitMurmur3HashStruct() {
+    try (ColumnVector strings = ColumnVector.fromStrings(
+        "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
+        "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+            "in the MD5 hash function. This string needed to be longer.",
+        null, null);
+         ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
+         ColumnVector doubles = ColumnVector.fromBoxedDoubles(
+            0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
+         ColumnVector floats = ColumnVector.fromBoxedFloats(
+            0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
+         ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
+         ColumnVector result = ColumnVector.serial32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools});
+         ColumnVector expected = ColumnVector.fromBoxedInts(387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
   @Test
   void testSpark32BitMurmur3HashStrings() {
     try (ColumnVector v0 = ColumnVector.fromStrings(
@@ -529,6 +548,8 @@ void testSpark32BitMurmur3HashDoubles() {

   @Test
   void testSpark32BitMurmur3HashTimestamps() {
+    // The hash values were derived from Apache Spark in a manner similar to the one documented at
+    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
     try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs(
         0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL);
         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
@@ -539,6 +560,8 @@ void testSpark32BitMurmur3HashDecimal64() {
+    // The hash values were derived from Apache Spark in a manner similar to the one documented at
+    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
     try (ColumnVector v = ColumnVector.decimalFromLongs(-7,
        0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL);
         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
@@ -549,6 +572,8 @@ void testSpark32BitMurmur3HashDecimal32() {
+    // The hash values were derived from Apache Spark in a manner similar to the one documented at
+    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
     try (ColumnVector v = ColumnVector.decimalFromInts(-3,
        0, 100, -100, 0x12345678, -0x12345678);
         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
@@ -559,6 +584,8 @@ void testSpark32BitMurmur3HashDates() {
+    // The hash values were derived from Apache Spark in a manner similar to the one documented at
+    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
     try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts(
         0, null, 100, -100, 0x12345678, null, -0x12345678);
         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
@@ -587,7 +614,6 @@ void testSpark32BitMurmur3HashBools() {
         ColumnVector result = ColumnVector.spark32BitMurmurHash3(0, new ColumnVector[]{v0, v1});
         ColumnVector expected = ColumnVector.fromBoxedInts(0, -1589400010, -239939054, -68075478, 593689054, -1194558265)) {
      assertColumnsAreEqual(expected, result);
-
     }
   }
@@ -610,6 +636,26 @@ void testSpark32BitMurmur3HashMixed() {
     }
   }

+  @Test
+  void testSpark32BitMurmur3HashStruct() {
+    try (ColumnVector strings = ColumnVector.fromStrings(
+        "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
+        "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+            "in the MD5 hash function. This string needed to be longer.",
+        null, null);
+         ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
+         ColumnVector doubles = ColumnVector.fromBoxedDoubles(
+            0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
+         ColumnVector floats = ColumnVector.fromBoxedFloats(
+            0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
+         ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
+         ColumnView structs = ColumnView.makeStructView(strings, integers, doubles, floats, bools);
+         ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structs});
+         ColumnVector expected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools})) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
   @Test
   void testAndNullReconfigureNulls() {
     try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);

From 32ffab18317bc8b14aec976491712882586f68c0 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 30 Mar 2021 08:40:06 -0500
Subject: [PATCH 2/3] Explicitly specify lambda captures

---
 cpp/src/hash/hashing.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu
index ba77df198bf..6183840b698 100644
--- a/cpp/src/hash/hashing.cu
+++ b/cpp/src/hash/hashing.cu
@@ -42,7 +42,7 @@ template <typename IterType>
 std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)
 {
   std::vector<column_view> leaf_columns;
-  for_each(iter_begin, iter_end, [&](column_view const& col) {
+  for_each(iter_begin, iter_end, [&leaf_columns](column_view const& col) {
     if (is_nested(col.type())) {
       CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "unsupported nested type");
       auto child_columns = to_leaf_columns(col.child_begin(), col.child_end());
       leaf_columns.insert(leaf_columns.end(), child_columns.begin(), child_columns.end());

From 87687c9c1302c20965c16cc2bb289170c7205394 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 30 Mar 2021 09:08:24 -0500
Subject: [PATCH 3/3] Qualify namespace of for_each

---
 cpp/src/hash/hashing.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu
index 6183840b698..53be019f73b 100644
--- a/cpp/src/hash/hashing.cu
+++ b/cpp/src/hash/hashing.cu
@@ -29,6 +29,8 @@
 #include

+#include <algorithm>
+
 namespace cudf {
 namespace {
@@ -42,7 +44,7 @@ template <typename IterType>
 std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)
 {
   std::vector<column_view> leaf_columns;
-  for_each(iter_begin, iter_end, [&leaf_columns](column_view const& col) {
+  std::for_each(iter_begin, iter_end, [&leaf_columns](column_view const& col) {
     if (is_nested(col.type())) {
       CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "unsupported nested type");
       auto child_columns = to_leaf_columns(col.child_begin(), col.child_end());
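
Usage note (not part of the patch series above): a minimal sketch of the behavior this series introduces, written against libcudf in the style of the hash_test.cpp cases from PATCH 1/3. The column contents and the function name struct_hash_sketch are illustrative assumptions, not code from the patches.

    #include <cudf/hashing.hpp>
    #include <cudf/table/table_view.hpp>
    #include <cudf_test/column_wrapper.hpp>

    // Hash a STRUCT<string, int32> column with serial Murmur3. Because the
    // implementation flattens struct columns to their leaf columns before
    // hashing, the struct hash is expected to match hashing the leaf columns
    // passed side by side -- the property the new test assertions check.
    void struct_hash_sketch()
    {
      using namespace cudf::test;
      strings_column_wrapper strings({"", "abc", "123"});
      fixed_width_column_wrapper<int32_t> ints({0, 100, -100});
      structs_column_wrapper structs{{strings, ints}};

      auto const struct_hash = cudf::hash(cudf::table_view({structs}),
                                          cudf::hash_id::HASH_SERIAL_MURMUR3, {});
      auto const flat_hash   = cudf::hash(cudf::table_view({strings, ints}),
                                          cudf::hash_id::HASH_SERIAL_MURMUR3, {});
      // struct_hash and flat_hash are INT32 columns with identical per-row values.
    }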