diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 43386e926d2..a77351fe731 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -652,5 +652,22 @@ namespace detail {
  * @return The hash value
  */
 size_t shallow_hash(column_view const& input);
+
+/**
+ * @brief Equality operator for column views based on the shallow state of the column view.
+ *
+ * The only shallow states used for the equality comparison are: type, size, data pointer,
+ * null_mask pointer, offset and the column_view of the children recursively. Note that
+ * `null_count` is not used.
+ *
+ * Note: This equality function will consider a column not equal to a copy of the same column with
+ * exactly the same contents. It is guaranteed to return true for the same column_view only, even
+ * if the underlying data changes.
+ *
+ * @param lhs The left `column_view` to compare
+ * @param rhs The right `column_view` to compare
+ * @return true if the shallow states of the two column views are equal
+ */
+bool shallow_equal(column_view const& lhs, column_view const& rhs);
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index d1202108ae5..7e0bde86b74 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -95,6 +95,23 @@ size_t shallow_hash(column_view const& input)
                 });
   return hash;
 }
+
+// Shallow comparison: the views' own type, size, head pointer, null mask pointer and offset must
+// match, and the children must be pairwise shallow-equal. The four-iterator std::equal overload
+// also returns false when the two views have a different number of children.
+bool shallow_equal(column_view const& lhs, column_view const& rhs)
+{
+  return (lhs.type() == rhs.type()) and (lhs.size() == rhs.size()) and
+         (lhs.head() == rhs.head()) and (lhs.null_mask() == rhs.null_mask()) and
+         (lhs.offset() == rhs.offset()) and
+         std::equal(lhs.child_begin(),
+                    lhs.child_end(),
+                    rhs.child_begin(),
+                    rhs.child_end(),
+                    [](auto const& lhs_child, auto const& rhs_child) {
+                      return shallow_equal(lhs_child, rhs_child);
+                    });
+}
 }  // namespace detail
 
 // Immutable view constructor
diff --git 
a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp
index b0f6eeac450..25af9b968e6 100644
--- a/cpp/tests/column/column_view_shallow_test.cpp
+++ b/cpp/tests/column/column_view_shallow_test.cpp
@@ -217,3 +217,136 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash)
     }
   }
 }
+
+TYPED_TEST(ColumnViewShallowTests, shallow_equal)
+{
+  using namespace cudf::detail;
+  auto col = example_column<TypeParam>();
+  auto col_view = cudf::column_view{*col};
+  // a view compared with itself = equal
+  {
+    EXPECT_TRUE(shallow_equal(col_view, col_view));
+  }
+  // copy of the column_view = equal
+  {
+    auto col_view_copy = col_view;
+    EXPECT_TRUE(shallow_equal(col_view, col_view_copy));
+  }
+  // view of a copied column = not equal
+  {
+    auto col_new = std::make_unique<cudf::column>(*col);
+    auto col_view_copy = col_new->view();
+    EXPECT_FALSE(shallow_equal(col_view, col_view_copy));
+  }
+  // new column_view from the same column = equal
+  {
+    auto col_view_new = cudf::column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_view, col_view_new));
+  }
+  // update data + new column_view = equal (contents are not compared).
+  {
+    // update data by modifying some bits: fixed_width, string, dict, list, struct
+    if constexpr (cudf::is_fixed_width<TypeParam>()) {
+      // Update data
+      auto data = reinterpret_cast<cudf::bitmask_type*>(col->mutable_view().head());
+      cudf::set_null_mask(data, 2, 64, true);
+    } else {
+      // Update child(0).data
+      auto data = reinterpret_cast<cudf::bitmask_type*>(col->child(0).mutable_view().head());
+      cudf::set_null_mask(data, 2, 64, true);
+    }
+    auto col_view_new = cudf::column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_view, col_view_new));
+  }
+  // add null_mask + new column_view = not equal (old view predates the new mask).
+  {
+    col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID));
+    auto col_view_new = cudf::column_view{*col};
+    EXPECT_FALSE(shallow_equal(col_view, col_view_new));
+    col_view_new.null_count();  // querying null_count is expected to leave the result unchanged
+    EXPECT_FALSE(shallow_equal(col_view, col_view_new));
+    auto col_view_new2 = cudf::column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_view_new, col_view_new2));
+  }
+  col_view = cudf::column_view{*col};  // refresh the baseline view after adding null_mask
+  // update nulls + new column_view = equal.
+  {
+    cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false);
+    auto col_view_new = cudf::column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_view, col_view_new));
+  }
+  // set_null_count + new column_view = equal, for UNKNOWN_NULL_COUNT and for col->size()
+  {
+    col->set_null_count(cudf::UNKNOWN_NULL_COUNT);
+    auto col_view_new = cudf::column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_view, col_view_new));
+    col->set_null_count(col->size());
+    auto col_view_new2 = cudf::column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_view, col_view_new2));
+  }
+
+  // column_view of a different column = not equal.
+  {
+    auto col_diff = example_column<TypeParam>();
+    auto col_view_diff = cudf::column_view{*col_diff};
+    EXPECT_FALSE(shallow_equal(col_view, col_view_diff));
+  }
+  // column_view, sliced[0, size] = equal (same for the matching split piece)
+  {
+    auto col_sliced = cudf::slice(col_view, {0, col_view.size()});
+    EXPECT_TRUE(shallow_equal(col_view, col_sliced[0]));
+    auto col_split = cudf::split(col_view, {0});
+    EXPECT_FALSE(shallow_equal(col_view, col_split[0]));
+    EXPECT_TRUE(shallow_equal(col_view, col_split[1]));
+  }
+  // column_view, sliced[n:] = not equal (for split too)
+  {
+    auto col_sliced = cudf::slice(col_view, {1, col_view.size()});
+    EXPECT_FALSE(shallow_equal(col_view, col_sliced[0]));
+    auto col_split = cudf::split(col_view, {1});
+    EXPECT_FALSE(shallow_equal(col_view, col_split[0]));
+    EXPECT_FALSE(shallow_equal(col_view, col_split[1]));
+  }
+  // column_view, bit_cast to the opposite-signedness integer type = not equal
+  {
+    if constexpr (std::is_integral_v<TypeParam> and not std::is_same_v<TypeParam, bool>) {
+      using newType = std::conditional_t<std::is_signed_v<TypeParam>,
+                                         std::make_unsigned_t<TypeParam>,
+                                         std::make_signed_t<TypeParam>>;
+      auto new_type = cudf::data_type(cudf::type_to_id<newType>());
+      auto col_bitcast = cudf::bit_cast(col_view, new_type);
+      EXPECT_FALSE(shallow_equal(col_view, col_bitcast));
+    }
+  }
+  // mutable_column_view, column_view of the same column = equal
+  {
+    auto col_mutable = cudf::mutable_column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_mutable, col_view));
+  }
+  // mutable_column_view, modified mutable_column_view = equal
+  // update the children column data = equal
+  {
+    auto col_mutable = cudf::mutable_column_view{*col};
+    if constexpr (cudf::is_fixed_width<TypeParam>()) {
+      // Update data
+      auto data = reinterpret_cast<cudf::bitmask_type*>(col->mutable_view().head());
+      cudf::set_null_mask(data, 1, 32, false);
+    } else {
+      // Update child(0).data
+      auto data = reinterpret_cast<cudf::bitmask_type*>(col->child(0).mutable_view().head());
+      cudf::set_null_mask(data, 1, 32, false);
+    }
+    EXPECT_TRUE(shallow_equal(col_view, col_mutable));
+    auto col_mutable_new = cudf::mutable_column_view{*col};
+    EXPECT_TRUE(shallow_equal(col_mutable, col_mutable_new));
+  }
+  // update the children column_views (new null mask on child 0) = not equal
+  {
+    if constexpr (cudf::is_nested<TypeParam>()) {
+      col->child(0).set_null_mask(
+        cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL));
+      auto col_child_updated = cudf::mutable_column_view{*col};
+      EXPECT_FALSE(shallow_equal(col_view, col_child_updated));
+    }
+  }
+}