From 65b64f675d5e87e45b7350782eab88293b633a49 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 26 Jun 2024 11:55:16 -0500 Subject: [PATCH] Fix segfault in conditional join (#16094) Closes #16066. I found a bug that would cause the reported segfault and have fixed it in this PR. When the right table has zero rows, conditional left anti-joins were returning a vector of indices containing garbage data. Along the way, I refactored several parts of the conditional join tests and added coverage for more cases. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16094 --- cpp/src/join/conditional_join.cu | 13 +--- cpp/tests/join/conditional_join_tests.cu | 92 +++++++++++++++++------- 2 files changed, 70 insertions(+), 35 deletions(-) diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index f02dee5f7f5..97a06d5a923 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -48,8 +48,7 @@ std::unique_ptr> conditional_join_anti_semi( { if (right.num_rows() == 0) { switch (join_type) { - case join_kind::LEFT_ANTI_JOIN: - return std::make_unique>(left.num_rows(), stream, mr); + case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first; case join_kind::LEFT_SEMI_JOIN: return std::make_unique>(0, stream, mr); default: CUDF_FAIL("Invalid join kind."); break; @@ -96,10 +95,6 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - if (left.num_rows() == 0) { - return std::make_unique>(0, stream, mr); - } - rmm::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -149,8 +144,7 @@ conditional_join(table_view const& left, // with a corresponding NULL from the right. case join_kind::LEFT_JOIN: case join_kind::LEFT_ANTI_JOIN: - case join_kind::FULL_JOIN: - return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource()); + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr); // Inner and left semi joins return empty output because no matches can exist. case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: @@ -169,8 +163,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { - auto ret_flipped = - get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource()); + auto ret_flipped = get_trivial_left_join_indices(right, stream, mr); return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 79968bcd7f4..7ab4a2ea465 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { std::vector> expected_outputs) { auto result_size = this->join_size(left, right, predicate); - EXPECT_TRUE(result_size == expected_outputs.size()); - - auto result = this->join(left, right, predicate); - std::vector> result_pairs; - for (size_t i = 0; i < result.first->size(); ++i) { - // Note: Not trying to be terribly efficient here since these tests are - // small, otherwise a batch copy to host before constructing the tuples - // would be important. - result_pairs.push_back({result.first->element(i, cudf::get_default_stream()), - result.second->element(i, cudf::get_default_stream())}); - } + EXPECT_EQ(result_size, expected_outputs.size()); + + auto result = this->join(left, right, predicate); + auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream()); + auto rhs_result = + cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream()); + std::vector> result_pairs(lhs_result.size()); + std::transform(lhs_result.begin(), + lhs_result.end(), + rhs_result.begin(), + result_pairs.begin(), + [](cudf::size_type lhs, cudf::size_type rhs) { + return std::pair{lhs, rhs}; + }); std::sort(result_pairs.begin(), result_pairs.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin())); + EXPECT_TRUE(std::equal( + expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end())); } /* @@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty) this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); }; +TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {}); +}; + TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual) { this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}}); @@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty) this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); }; +TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, + {{}}, + left_zero_eq_right_zero, + {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}}); +}; + TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash) { auto [left, right] = gen_random_repeated_columns(); @@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty) {{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}}); }; +TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, + {{}}, + left_zero_eq_right_zero, + {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}}); +}; + TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, @@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); auto result_size = this->join_size(left, right, predicate); - EXPECT_TRUE(result_size == expected_outputs.size()); - - auto result = this->join(left, right, predicate); - std::vector resulting_indices; - for (size_t i = 0; i < result->size(); ++i) { - // Note: Not trying to be terribly efficient here since these tests are - // small, otherwise a batch copy to host before constructing the tuples - // would be important. - resulting_indices.push_back(result->element(i, cudf::get_default_stream())); - } - std::sort(resulting_indices.begin(), resulting_indices.end()); + EXPECT_EQ(result_size, expected_outputs.size()); + + auto result = this->join(left, right, predicate); + auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream()); + std::sort(result_indices.begin(), result_indices.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE( - std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin())); + EXPECT_TRUE(std::equal(result_indices.begin(), + result_indices.end(), + expected_outputs.begin(), + expected_outputs.end())); } void _compare_to_hash_join(std::unique_ptr> const& result, @@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest { TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty) +{ + this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); +}; + +TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {}); +}; + TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1}); @@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest { TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty) +{ + this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {}); +}; + +TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty) +{ + this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2}); +}; + TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {2});