From 1cb527f01aac631f4d44866b5474e503501d58cd Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 23 Sep 2021 01:26:59 +0530 Subject: [PATCH] Add shallow hash function and shallow equality comparison for column_view (#9185) Fixes #9140 Added `shallow_hash(column_view)` Added unit tests It computes hash values based on the shallow states of `column_view`: type, size, data pointer, null_mask pointer, offset, and the hash value of the children. `null_count` is not used since it is a cached value and it may vary based on contents of `null_mask`, and may be pre-computed or not. Fixes #9139 Added `is_shallow_equivalent(column_view, column_view)` ~shallow_equal~ Added unit tests It compares two column_views based on the shallow states of column_view: type, size, data pointer, null_mask pointer, offset, and the column_view of the children. null_count is not used since it is a cached value and it may vary based on contents of null_mask, and may be pre-computed or not. 
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) - Jake Hemstad (https://github.com/jrhemstad) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9185 --- cpp/include/cudf/column/column_view.hpp | 41 ++ cpp/include/cudf/detail/hashing.hpp | 36 ++ .../cudf/detail/utilities/hash_functions.cuh | 12 + cpp/include/cudf_test/type_lists.hpp | 12 + cpp/src/column/column_view.cpp | 55 +++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/column/column_view_shallow_test.cpp | 442 ++++++++++++++++++ 7 files changed, 599 insertions(+) create mode 100644 cpp/tests/column/column_view_shallow_test.cpp diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 7feaeafbad0..cd490c3c832 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -633,4 +633,45 @@ column_view bit_cast(column_view const& input, data_type type); */ mutable_column_view bit_cast(mutable_column_view const& input, data_type type); +namespace detail { +/** + * @brief Computes a hash value from the shallow state of the specified column + * + * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) == + * shallow_hash(c1)`. + * + * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., + * it is independent of the number of elements in the column. + * + * This function does _not_ inspect the elements of `input` nor access any device memory or launch + * any kernels. + * + * @param input The `column_view` to compute hash + * @return The hash value derived from the shallow state of `input`. 
+ */ +std::size_t shallow_hash(column_view const& input); + +/** + * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns + * + * Two columns are equivalent if for any operation `F` then: + * ``` + * is_shallow_equivalent(c0, c1) ==> The results of F(c0) and F(c1) are equivalent + * ``` + * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact + * same physical column. In other words, two physically independent columns may have exactly + * equivalent elements but their shallow state would not be equivalent. + * + * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`, + * i.e., it is independent of the number of elements in either column. + * + * This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor + * launch any kernels. + * + * @param lhs The left `column_view` to compare + * @param rhs The right `column_view` to compare + * @return If `lhs` and `rhs` have equivalent shallow state + */ +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs); +} // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 83d6be14709..bd5c8a42a51 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -19,6 +19,9 @@ #include +#include +#include + namespace cudf { namespace detail { @@ -53,5 +56,38 @@ std::unique_ptr serial_murmur_hash3_32( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +/** + * @brief Combines two hashed values into a single hashed value. 
+ * + * Adapted from Boost hash_combine function, modified for 64-bit + * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + * + * @param lhs The first hashed value + * @param rhs The second hashed value + * @return Combined hash value + */ +constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) +{ + lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); + return lhs; +} } // namespace detail } // namespace cudf + +// specialization of std::hash for cudf::data_type +namespace std { +template <> +struct hash { + std::size_t operator()(cudf::data_type const& type) const noexcept + { + return cudf::detail::hash_combine(std::hash{}(static_cast(type.id())), + std::hash{}(type.scale())); + } +}; +} // namespace std diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 6eab13ae9af..65deadd6cd0 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -395,6 +395,12 @@ struct MurmurHash3_32 { return h; } + /* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. @@ -795,6 +801,12 @@ struct IdentityHash { IdentityHash() = default; constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} + /* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. 
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 74688b7f133..982c94ac402 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -315,6 +315,18 @@ using FixedWidthTypesWithoutChrono = Concat; */ using ComparableTypes = Concat; +/** + * @brief Provides a list of all compound types for use in GTest typed tests. + * + * Example: + * ``` + * // Invokes all typed fixture tests for all compound types in libcudf + * TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes); + * ``` + */ +using CompoundTypes = + cudf::test::Types; + /** * @brief Provides a list of all types supported in libcudf for use in a GTest * typed test. diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 186669ae697..5749cb48c0e 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -22,6 +23,7 @@ #include +#include #include #include #include @@ -76,6 +78,59 @@ size_type column_view_base::null_count(size_type begin, size_type end) const ? 0 : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); } + +// Struct to use custom hash combine and fold expression +struct HashValue { + std::size_t hash; + explicit HashValue(std::size_t h) : hash{h} {} + HashValue operator^(HashValue const& other) const + { + return HashValue{hash_combine(hash, other.hash)}; + } +}; + +template +constexpr auto hash(Ts&&... ts) +{ + return (... ^ HashValue(std::hash{}(ts))).hash; +} + +std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false) +{ + std::size_t const init = (is_parent_empty or c.is_empty()) + ? 
hash(c.type(), 0) + : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); + return std::accumulate(c.child_begin(), + c.child_end(), + init, + [&c, is_parent_empty](std::size_t hash, auto const& child) { + return hash_combine( + hash, shallow_hash_impl(child, c.is_empty() or is_parent_empty)); + }); +} + +std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } + +bool shallow_equivalent_impl(column_view const& lhs, + column_view const& rhs, + bool is_parent_empty = false) +{ + bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; + return (lhs.type() == rhs.type()) and + (is_empty or ((lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and + (lhs.null_mask() == rhs.null_mask()) and (lhs.offset() == rhs.offset()))) and + std::equal(lhs.child_begin(), + lhs.child_end(), + rhs.child_begin(), + rhs.child_end(), + [is_empty](auto const& lhs_child, auto const& rhs_child) { + return shallow_equivalent_impl(lhs_child, rhs_child, is_empty); + }); +} +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) +{ + return shallow_equivalent_impl(lhs, rhs); +} } // namespace detail // Immutable view constructor diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 03f7967cee0..cde170fb598 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -33,6 +33,7 @@ endfunction() # - column tests ---------------------------------------------------------------------------------- ConfigureTest(COLUMN_TEST column/bit_cast_test.cpp + column/column_view_shallow_test.cpp column/column_test.cu column/column_device_view_test.cu column/compound_test.cu) diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp new file mode 100644 index 00000000000..f76f682bb2f --- /dev/null +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +// fixed_width, dict, string, list, struct +template ()>* = nullptr> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template ()>* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::dictionary_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0}) + .release(); +} + +template or + std::is_same_v>* = nullptr> +std::unique_ptr example_column() + +{ + return cudf::test::strings_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}) + .release(); +} + +template >* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::lists_column_wrapper({{1, 2, 3}, {4, 5}, {}, {6, 7, 8}}).release(); +} + +template >* = nullptr> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + auto member_0 = cudf::test::fixed_width_column_wrapper(begin, end); + auto member_1 = cudf::test::fixed_width_column_wrapper(begin + 10, end + 10); + return cudf::test::structs_column_wrapper({member_0, member_1}).release(); +} + +template +struct ColumnViewShallowTests : 
public cudf::test::BaseFixture { +}; + +using AllTypes = cudf::test::Concat; +TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); + +// Test for fixed_width, dict, string, list, struct +// column_view, column_view = same hash. +// column_view, make a copy = same hash. +// new column_view from column = same hash +// column_view, copy column = diff hash +// column_view, diff column = diff hash. +// +// column_view old, update data + new column_view = same hash. +// column_view old, add null_mask + new column_view = diff hash. +// column_view old, update nulls + new column_view = same hash. +// column_view old, set_null_count + new column_view = same hash. +// +// column_view, sliced[0, size) = same hash (for split too) +// column_view, sliced[n:) = diff hash (for split too) +// column_view, bit_cast = diff hash +// +// mutable_column_view, column_view = same hash +// mutable_column_view, modified mutable_column_view = same hash +// +// update the children column data = same hash +// update the children column_views = diff hash + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_basic) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_copy)); + } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_copy)); + } + + // column_view, diff column = diff hash. 
+ { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_diff)); + } +} +TYPED_TEST(ColumnViewShallowTests, shallow_hash_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // update data + new column_view = same hash. + { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // add null_mask + new column_view = diff hash. + { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + col_view_new.null_count(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view_new), shallow_hash(col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // set_null_count + new column_view = same hash. 
set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new2)); + } +} + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_sliced[2])); + EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_new_sliced[0])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_new_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[2]), shallow_hash(col_new_sliced[2])); + } + + // 
column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_bitcast)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_child_updated)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_basic) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_TRUE(is_shallow_equivalent(col_view, 
col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_copy)); + } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_copy)); + } + + // column_view, diff column = diff hash. + { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_diff)); + } +} +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // update data + new column_view = same hash. + { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + // add null_mask + new column_view = diff hash. 
+ { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); + col_view_new.null_count(); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view_new, col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new2)); + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); + 
EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[1])); + } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_sliced[2])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_new_sliced[0])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_new_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[2], col_new_sliced[2])); + } + + // column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_bitcast)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + 
EXPECT_TRUE(is_shallow_equivalent(col_view, col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_child_updated)); + } + } +}