From 2808a5c1d83a17169a204bbfe66a3e6686f038d8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 11 Jan 2021 12:07:11 -0500 Subject: [PATCH 001/138] Add a compute_hash_join_indices that returns just the join indices --- cpp/src/join/hash_join.cu | 28 ++++++++++++++++++++++++---- cpp/src/join/hash_join.cuh | 10 ++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index c2c32d4165a..5fc979941b9 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -573,8 +573,8 @@ std::unique_ptr hash_join::hash_join_impl::full_join( } template -std::pair, std::unique_ptr> -hash_join::hash_join_impl::compute_hash_join( +std::pair, rmm::device_vector> +hash_join::hash_join_impl::compute_hash_join_indices( cudf::table_view const &probe, std::vector const &probe_on, std::vector> const &columns_in_common, @@ -601,7 +601,7 @@ hash_join::hash_join_impl::compute_hash_join( "Invalid values passed to columns_in_common"); if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { - return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side); + return std::make_pair(rmm::device_vector{}, rmm::device_vector{}); } auto probe_selected = probe.select(probe_on); @@ -615,7 +615,27 @@ hash_join::hash_join_impl::compute_hash_join( constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) ? 
cudf::detail::join_kind::LEFT_JOIN : JoinKind; - auto joined_indices = probe_join_indices(probe_selected, compare_nulls, stream); + return probe_join_indices(probe_selected, compare_nulls, stream); +} + +template +std::pair, std::unique_ptr> +hash_join::hash_join_impl::compute_hash_join( + cudf::table_view const &probe, + std::vector const &probe_on, + std::vector> const &columns_in_common, + common_columns_output_side common_columns_output_side, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + auto joined_indices = compute_hash_join_indices( + probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); + + if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { + return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side); + } + return cudf::detail::construct_join_output_df( probe, _build, joined_indices, columns_in_common, common_columns_output_side, stream, mr); } diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 712d771bd73..00dfc492260 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -257,6 +257,16 @@ struct hash_join::hash_join_impl { rmm::mr::device_memory_resource* mr) const; private: + template + std::pair, rmm::device_vector> compute_hash_join_indices( + cudf::table_view const& probe, + std::vector const& probe_on, + std::vector> const& columns_in_common, + common_columns_output_side common_columns_output_side, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + /** * @brief Performs hash join by probing the columns provided in `probe` as per * the joining indices given in `probe_on` and returns a (`probe`, `_build`) table pair, which From ef0baee062880f885b0506832e8d4fd50269c747 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 11 Jan 2021 13:06:59 -0500 Subject: [PATCH 002/138] Don't need 
common_columns stuff for join that returns a gathermap --- cpp/src/join/hash_join.cu | 38 +++++++++++++++++--------------------- cpp/src/join/hash_join.cuh | 4 +--- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 5fc979941b9..109698e3752 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -574,14 +574,11 @@ std::unique_ptr hash_join::hash_join_impl::full_join( template std::pair, rmm::device_vector> -hash_join::hash_join_impl::compute_hash_join_indices( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -589,17 +586,6 @@ hash_join::hash_join_impl::compute_hash_join_indices( CUDF_EXPECTS(_build_on.size() == probe_on.size(), "Mismatch in number of columns to be joined on"); - CUDF_EXPECTS(std::all_of(columns_in_common.begin(), - columns_in_common.end(), - [this, &probe_on](auto pair) { - size_t p = std::find(probe_on.begin(), probe_on.end(), pair.first) - - probe_on.begin(); - size_t b = std::find(_build_on.begin(), _build_on.end(), pair.second) - - _build_on.begin(); - return (p != probe_on.size()) && (b != _build_on.size()) && (p == b); - }), - "Invalid values passed to columns_in_common"); - if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { return std::make_pair(rmm::device_vector{}, rmm::device_vector{}); } @@ -629,8 +615,18 @@ hash_join::hash_join_impl::compute_hash_join( 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) const { - auto joined_indices = compute_hash_join_indices( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); + CUDF_EXPECTS(std::all_of(columns_in_common.begin(), + columns_in_common.end(), + [this, &probe_on](auto pair) { + size_t p = std::find(probe_on.begin(), probe_on.end(), pair.first) - + probe_on.begin(); + size_t b = std::find(_build_on.begin(), _build_on.end(), pair.second) - + _build_on.begin(); + return (p != probe_on.size()) && (b != _build_on.size()) && (p == b); + }), + "Invalid values passed to columns_in_common"); + + auto joined_indices = compute_hash_join(probe, probe_on, compare_nulls, stream, mr); if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side); diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 00dfc492260..fb2b52e62e8 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -258,11 +258,9 @@ struct hash_join::hash_join_impl { private: template - std::pair, rmm::device_vector> compute_hash_join_indices( + std::pair, rmm::device_vector> compute_hash_join( cudf::table_view const& probe, std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; From 18f3074bd6bacb03b1de5d74804b767707c1238a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 11 Jan 2021 13:32:09 -0500 Subject: [PATCH 003/138] Add hash_join_impl methods that return gathermaps --- cpp/src/join/hash_join.cu | 36 ++++++++++++++++++++++++++++++++++++ cpp/src/join/hash_join.cuh | 21 +++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 109698e3752..1c19ff9d000 100644 --- 
a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -515,6 +515,18 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream); } +std::pair, rmm::device_vector> +hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + CUDF_FUNC_RANGE(); + return compute_hash_join( + probe, probe_on, compare_nulls, stream, mr); +} + std::pair, std::unique_ptr> hash_join::hash_join_impl::inner_join( cudf::table_view const &probe, @@ -530,6 +542,18 @@ hash_join::hash_join_impl::inner_join( probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } +std::pair, rmm::device_vector> +hash_join::hash_join_impl::left_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + CUDF_FUNC_RANGE(); + return compute_hash_join( + probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::hash_join_impl::left_join( cudf::table_view const &probe, std::vector const &probe_on, @@ -551,6 +575,18 @@ std::unique_ptr hash_join::hash_join_impl::left_join( std::move(probe_build_pair.second)); } +std::pair, rmm::device_vector> +hash_join::hash_join_impl::full_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + CUDF_FUNC_RANGE(); + return compute_hash_join( + probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::hash_join_impl::full_join( cudf::table_view const &probe, std::vector const &probe_on, diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index fb2b52e62e8..34f6d4f262d 100644 --- a/cpp/src/join/hash_join.cuh 
+++ b/cpp/src/join/hash_join.cuh @@ -231,6 +231,13 @@ struct hash_join::hash_join_impl { null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); + std::pair, rmm::device_vector> inner_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -240,6 +247,13 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; + std::pair, rmm::device_vector> left_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + std::unique_ptr left_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -248,6 +262,13 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; + std::pair, rmm::device_vector> full_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + std::unique_ptr full_join( cudf::table_view const& probe, std::vector const& probe_on, From 70abf483c5a0202fbf90ee7b956060158692a11b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 11 Jan 2021 13:52:06 -0500 Subject: [PATCH 004/138] Add overloads to public hash_join class --- cpp/include/cudf/join.hpp | 22 ++++++++++++++++++++++ cpp/src/join/join.cu | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b2c1296ccef..cbe984947d1 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -412,6 +413,13 @@ class hash_join { ///< `inner_join`. 
}; + std::pair, rmm::device_vector> inner_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** * @brief Performs an inner join by probing in the internal hash table. * @@ -456,6 +464,13 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + std::pair, rmm::device_vector> left_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** * @brief Performs a left join by probing in the internal hash table. * @@ -486,6 +501,13 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + std::pair, rmm::device_vector> full_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** * @brief Performs a full join by probing in the internal hash table. 
* diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index ce27cfcd616..87bd9786f8c 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -139,6 +139,16 @@ hash_join::hash_join(cudf::table_view const& build, { } +std::pair, rmm::device_vector> hash_join::inner_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return impl->inner_join(probe, probe_on, compare_nulls, stream, mr); +} + std::pair, std::unique_ptr> hash_join::inner_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -152,6 +162,16 @@ std::pair, std::unique_ptr> hash_join: probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } +std::pair, rmm::device_vector> hash_join::left_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return impl->left_join(probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::left_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -163,6 +183,16 @@ std::unique_ptr hash_join::left_join( return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } +std::pair, rmm::device_vector> hash_join::full_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return impl->full_join(probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::full_join( cudf::table_view const& probe, std::vector const& probe_on, From 13dff676523ea8ca91d00bf462b2a54a0dd0b9bc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 11 Jan 2021 17:25:33 -0500 Subject: [PATCH 005/138] Add top-level join APIs that return gathermaps --- cpp/include/cudf/join.hpp | 25 
+++++++++++++++++++++++++ cpp/tests/join/join_tests.cpp | 12 ++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cbe984947d1..91627321f59 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -31,6 +31,14 @@ namespace cudf { * @file */ +std::pair, rmm::device_vector> inner_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs an inner join on the specified columns of two * tables (`left`, `right`) @@ -97,6 +105,14 @@ std::unique_ptr inner_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, rmm::device_vector> left_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a left join (also known as left outer join) on the * specified columns of two tables (`left`, `right`) @@ -165,6 +181,14 @@ std::unique_ptr left_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, rmm::device_vector> full_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a full join (also known as full outer join) on the * specified columns of two tables (`left`, `right`) @@ -232,6 +256,7 @@ std::unique_ptr 
full_join( std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a left semi join on the specified columns of two * tables (`left`, `right`) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index efc5330ea7d..b3b86e5cb66 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -101,7 +101,8 @@ TEST_F(JoinTest, FullJoinNoCommon) exp_cols.push_back(exp_col0_1.release()); Table gold(std::move(exp_cols)); - auto result = cudf::full_join(t0, t1, {0}, {0}, {}); + auto result = + cudf::full_join(t0, t1, {0}, {0}, std::vector>{}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -131,7 +132,8 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0}, {0}, {}); + auto result = + cudf::left_join(t0, t1, {0}, {0}, std::vector>{}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -1262,7 +1264,8 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); { - auto result = cudf::left_join(t0, t1, {0}, {0}, {}); + auto result = + cudf::left_join(t0, t1, {0}, {0}, std::vector>{}); auto result_view = result->view(); auto decoded1 = cudf::dictionary::decode(result_view.column(1)); auto decoded4 = cudf::dictionary::decode(result_view.column(4)); @@ -1273,7 +1276,8 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) decoded4->view(), result_view.column(5)}); - auto gold = cudf::left_join(g0, g1, {0}, {0}, {}); + auto gold = + cudf::left_join(g0, g1, {0}, {0}, std::vector>{}); CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); 
} { From 7ed694c00f2a9bd0214be21fc8f71da51c8489c8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 12 Jan 2021 12:47:31 -0500 Subject: [PATCH 006/138] Use device_uvector instead of device_vector in join --- cpp/include/cudf/join.hpp | 14 ++--- cpp/src/join/hash_join.cu | 91 ++++++++++++++++-------------- cpp/src/join/hash_join.cuh | 18 +++--- cpp/src/join/join.cu | 6 +- cpp/src/join/join_common_utils.hpp | 4 +- 5 files changed, 71 insertions(+), 62 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 91627321f59..2f1b1a1ab5b 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -31,7 +31,7 @@ namespace cudf { * @file */ -std::pair, rmm::device_vector> inner_join( +std::pair, rmm::device_uvector> inner_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -105,7 +105,7 @@ std::unique_ptr inner_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::pair, rmm::device_vector> left_join( +std::pair, rmm::device_uvector> left_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -181,7 +181,7 @@ std::unique_ptr left_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::pair, rmm::device_vector> full_join( +std::pair, rmm::device_uvector> full_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -438,7 +438,7 @@ class hash_join { ///< `inner_join`. 
}; - std::pair, rmm::device_vector> inner_join( + std::pair, rmm::device_uvector> inner_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, @@ -489,7 +489,7 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - std::pair, rmm::device_vector> left_join( + std::pair, rmm::device_uvector> left_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, @@ -526,7 +526,7 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - std::pair, rmm::device_vector> full_join( + std::pair, rmm::device_uvector> full_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 1c19ff9d000..426ed873df7 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -20,9 +20,10 @@ #include #include -#include +#include #include +#include #include namespace cudf { @@ -90,23 +91,25 @@ std::pair, std::unique_ptr> get_empty_joined_table return std::make_pair(std::move(empty_probe), std::move(empty_build)); } -VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b) +VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b, rmm::cuda_stream_view stream) { CUDF_EXPECTS((a.first.size() == a.second.size()), "Mismatch between sizes of vectors in vector pair"); CUDF_EXPECTS((b.first.size() == b.second.size()), "Mismatch between sizes of vectors in vector pair"); - if (a.first.empty()) { - return b; - } else if (b.first.empty()) { - return a; + if (a.first.is_empty()) { + return std::move(b); + } else if (b.first.is_empty()) { + return std::move(a); } auto original_size = a.first.size(); - 
a.first.resize(a.first.size() + b.first.size()); - a.second.resize(a.second.size() + b.second.size()); - thrust::copy(b.first.begin(), b.first.end(), a.first.begin() + original_size); - thrust::copy(b.second.begin(), b.second.end(), a.second.begin() + original_size); - return a; + a.first.resize(a.first.size() + b.first.size(), stream); + a.second.resize(a.second.size() + b.second.size(), stream); + thrust::copy( + rmm::exec_policy(stream), b.first.begin(), b.first.end(), a.first.begin() + original_size); + thrust::copy( + rmm::exec_policy(stream), b.second.begin(), b.second.end(), a.second.begin() + original_size); + return std::move(a); } template @@ -133,8 +136,8 @@ struct valid_range { * * @return Pair of vectors containing the left join indices complement */ -std::pair, rmm::device_vector> -get_left_join_indices_complement(rmm::device_vector &right_indices, +std::pair, rmm::device_uvector> +get_left_join_indices_complement(rmm::device_uvector &right_indices, size_type left_table_row_count, size_type right_table_row_count, rmm::cuda_stream_view stream) @@ -142,7 +145,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, // Get array of indices that do not appear in right_indices // Vector allocated for unmatched result - rmm::device_vector right_indices_complement(right_table_row_count); + rmm::device_uvector right_indices_complement(right_table_row_count, stream); // If left table is empty in a full join call then all rows of the right table // should be represented in the joined indices. 
This is an optimization since @@ -180,11 +183,14 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, right_indices_complement.begin(), thrust::identity()) - right_indices_complement.begin(); - right_indices_complement.resize(indices_count); + right_indices_complement.resize(indices_count, stream); } - rmm::device_vector left_invalid_indices(right_indices_complement.size(), - JoinNoneValue); + rmm::device_uvector left_invalid_indices(right_indices_complement.size(), stream); + thrust::fill(rmm::exec_policy(stream), + left_invalid_indices.begin(), + left_invalid_indices.end(), + JoinNoneValue); return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); } @@ -256,7 +262,7 @@ std::unique_ptr> build_join_ * @return Join output indices vector pair. */ template -std::pair, rmm::device_vector> probe_join_hash_table( +std::pair, rmm::device_uvector> probe_join_hash_table( cudf::table_device_view build_table, cudf::table_device_view probe_table, multimap_type const &hash_table, @@ -268,7 +274,8 @@ std::pair, rmm::device_vector> probe_jo // If the estimated output size is zero, return immediately if (estimated_size == 0) { - return std::make_pair(rmm::device_vector{}, rmm::device_vector{}); + return std::make_pair(rmm::device_uvector{0, stream}, + rmm::device_uvector{0, stream}); } // Because we are approximating the number of joined elements, our approximation @@ -278,12 +285,12 @@ std::pair, rmm::device_vector> probe_jo rmm::device_scalar write_index(0, stream); size_type join_size{0}; - rmm::device_vector left_indices; - rmm::device_vector right_indices; + rmm::device_uvector left_indices{0, stream}; + rmm::device_uvector right_indices{0, stream}; auto current_estimated_size = estimated_size; do { - left_indices.resize(estimated_size); - right_indices.resize(estimated_size); + left_indices.resize(estimated_size, stream); + right_indices.resize(estimated_size, stream); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; 
detail::grid_1d config(probe_table.num_rows(), block_size); @@ -292,16 +299,15 @@ std::pair, rmm::device_vector> probe_jo row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; probe_hash_table - <<>>( - hash_table, - build_table, - probe_table, - hash_probe, - equality, - left_indices.data().get(), - right_indices.data().get(), - write_index.data(), - estimated_size); + <<>>(hash_table, + build_table, + probe_table, + hash_probe, + equality, + left_indices.data(), + right_indices.data(), + write_index.data(), + estimated_size); CHECK_CUDA(stream.value()); @@ -310,8 +316,8 @@ std::pair, rmm::device_vector> probe_jo estimated_size *= 2; } while ((current_estimated_size < join_size)); - left_indices.resize(join_size); - right_indices.resize(join_size); + left_indices.resize(join_size, stream); + right_indices.resize(join_size, stream); return std::make_pair(std::move(left_indices), std::move(right_indices)); } @@ -444,7 +450,7 @@ std::pair, std::unique_ptr
> construct_join_output_ common_table = cudf::detail::concatenate( {common_from_build->view(), common_from_probe->view()}, stream, mr); } - joined_indices = concatenate_vector_pairs(complement_indices, joined_indices); + joined_indices = concatenate_vector_pairs(complement_indices, joined_indices, stream); } else { if (not columns_in_common.empty()) { common_table = detail::gather(probe.select(probe_common_col), @@ -515,7 +521,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream); } -std::pair, rmm::device_vector> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, std::vector const &probe_on, null_equality compare_nulls, @@ -542,7 +548,7 @@ hash_join::hash_join_impl::inner_join( probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } -std::pair, rmm::device_vector> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::left_join(cudf::table_view const &probe, std::vector const &probe_on, null_equality compare_nulls, @@ -575,7 +581,7 @@ std::unique_ptr hash_join::hash_join_impl::left_join( std::move(probe_build_pair.second)); } -std::pair, rmm::device_vector> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::full_join(cudf::table_view const &probe, std::vector const &probe_on, null_equality compare_nulls, @@ -609,7 +615,7 @@ std::unique_ptr hash_join::hash_join_impl::full_join( } template -std::pair, rmm::device_vector> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, std::vector const &probe_on, null_equality compare_nulls, @@ -623,7 +629,8 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, "Mismatch in number of columns to be joined on"); if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { - return std::make_pair(rmm::device_vector{}, rmm::device_vector{}); + 
return std::make_pair(rmm::device_uvector{0, stream}, + rmm::device_uvector{0, stream}); } auto probe_selected = probe.select(probe_on); @@ -674,7 +681,7 @@ hash_join::hash_join_impl::compute_hash_join( template std::enable_if_t, rmm::device_vector>> + std::pair, rmm::device_uvector>> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 34f6d4f262d..fed5080d2be 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include @@ -181,12 +181,12 @@ size_type estimate_join_output_size(table_device_view build_table, * * @return Join output indices vector pair */ -inline std::pair, rmm::device_vector> +inline std::pair, rmm::device_uvector> get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream) { - rmm::device_vector left_indices(left.num_rows()); + rmm::device_uvector left_indices(left.num_rows(), stream); thrust::sequence(rmm::exec_policy(stream), left_indices.begin(), left_indices.end(), 0); - rmm::device_vector right_indices(left.num_rows()); + rmm::device_uvector right_indices(left.num_rows(), stream); thrust::fill(rmm::exec_policy(stream), right_indices.begin(), right_indices.end(), JoinNoneValue); return std::make_pair(std::move(left_indices), std::move(right_indices)); } @@ -231,7 +231,7 @@ struct hash_join::hash_join_impl { null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); - std::pair, rmm::device_vector> inner_join( + std::pair, rmm::device_uvector> inner_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls, @@ -247,7 +247,7 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::pair, rmm::device_vector> left_join( + std::pair, rmm::device_uvector> 
left_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls, @@ -262,7 +262,7 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::pair, rmm::device_vector> full_join( + std::pair, rmm::device_uvector> full_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls, @@ -279,7 +279,7 @@ struct hash_join::hash_join_impl { private: template - std::pair, rmm::device_vector> compute_hash_join( + std::pair, rmm::device_uvector> compute_hash_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls, @@ -354,7 +354,7 @@ struct hash_join::hash_join_impl { */ template std::enable_if_t, rmm::device_vector>> + std::pair, rmm::device_uvector>> probe_join_indices(cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const; diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 87bd9786f8c..08af40405ec 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -139,7 +139,7 @@ hash_join::hash_join(cudf::table_view const& build, { } -std::pair, rmm::device_vector> hash_join::inner_join( +std::pair, rmm::device_uvector> hash_join::inner_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls, @@ -162,7 +162,7 @@ std::pair, std::unique_ptr> hash_join: probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } -std::pair, rmm::device_vector> hash_join::left_join( +std::pair, rmm::device_uvector> hash_join::left_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls, @@ -183,7 +183,7 @@ std::unique_ptr hash_join::left_join( return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } -std::pair, rmm::device_vector> hash_join::full_join( +std::pair, rmm::device_uvector> hash_join::full_join( cudf::table_view const& probe, 
std::vector const& probe_on, null_equality compare_nulls, diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index f0c158c1ef6..9965fa496aa 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include @@ -31,7 +33,7 @@ constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; constexpr size_type JoinNoneValue = -1; -using VectorPair = std::pair, rmm::device_vector>; +using VectorPair = std::pair, rmm::device_uvector>; using multimap_type = concurrent_unordered_multimap Date: Tue, 12 Jan 2021 15:11:28 -0500 Subject: [PATCH 007/138] Undo some API changes --- cpp/include/cudf/join.hpp | 45 ---------------------------------- cpp/src/join/hash_join.cu | 49 ++++++-------------------------------- cpp/src/join/hash_join.cuh | 33 +++++-------------------- cpp/src/join/join.cu | 30 ----------------------- 4 files changed, 13 insertions(+), 144 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 2f1b1a1ab5b..2b8fc4bb2cb 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -31,14 +31,6 @@ namespace cudf { * @file */ -std::pair, rmm::device_uvector> inner_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Performs an inner join on the specified columns of two * tables (`left`, `right`) @@ -105,14 +97,6 @@ std::unique_ptr inner_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::pair, rmm::device_uvector> left_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - 
null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Performs a left join (also known as left outer join) on the * specified columns of two tables (`left`, `right`) @@ -181,14 +165,6 @@ std::unique_ptr left_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::pair, rmm::device_uvector> full_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Performs a full join (also known as full outer join) on the * specified columns of two tables (`left`, `right`) @@ -438,13 +414,6 @@ class hash_join { ///< `inner_join`. }; - std::pair, rmm::device_uvector> inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** * @brief Performs an inner join by probing in the internal hash table. * @@ -489,13 +458,6 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - std::pair, rmm::device_uvector> left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** * @brief Performs a left join by probing in the internal hash table. 
* @@ -526,13 +488,6 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - std::pair, rmm::device_uvector> full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** * @brief Performs a full join by probing in the internal hash table. * diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 426ed873df7..d2d26dbc3b7 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -521,18 +521,6 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream); } -std::pair, rmm::device_uvector> -hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, - std::vector const &probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, probe_on, compare_nulls, stream, mr); -} - std::pair, std::unique_ptr> hash_join::hash_join_impl::inner_join( cudf::table_view const &probe, @@ -548,18 +536,6 @@ hash_join::hash_join_impl::inner_join( probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } -std::pair, rmm::device_uvector> -hash_join::hash_join_impl::left_join(cudf::table_view const &probe, - std::vector const &probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, probe_on, compare_nulls, stream, mr); -} - std::unique_ptr hash_join::hash_join_impl::left_join( cudf::table_view const &probe, std::vector const &probe_on, @@ -581,18 +557,6 @@ 
std::unique_ptr hash_join::hash_join_impl::left_join( std::move(probe_build_pair.second)); } -std::pair, rmm::device_uvector> -hash_join::hash_join_impl::full_join(cudf::table_view const &probe, - std::vector const &probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, probe_on, compare_nulls, stream, mr); -} - std::unique_ptr hash_join::hash_join_impl::full_join( cudf::table_view const &probe, std::vector const &probe_on, @@ -616,11 +580,11 @@ std::unique_ptr hash_join::hash_join_impl::full_join( template std::pair, rmm::device_uvector> -hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, - std::vector const &probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -669,7 +633,8 @@ hash_join::hash_join_impl::compute_hash_join( }), "Invalid values passed to columns_in_common"); - auto joined_indices = compute_hash_join(probe, probe_on, compare_nulls, stream, mr); + auto joined_indices = + compute_hash_join_indices(probe, probe_on, compare_nulls, stream, mr); if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side); diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index fed5080d2be..a222a932edd 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -231,13 +231,6 @@ struct hash_join::hash_join_impl { null_equality compare_nulls, rmm::cuda_stream_view stream = 
rmm::cuda_stream_default); - std::pair, rmm::device_uvector> inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -247,13 +240,6 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::pair, rmm::device_uvector> left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - std::unique_ptr left_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -262,13 +248,6 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::pair, rmm::device_uvector> full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - std::unique_ptr full_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -279,12 +258,12 @@ struct hash_join::hash_join_impl { private: template - std::pair, rmm::device_uvector> compute_hash_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair, rmm::device_uvector> + compute_hash_join_indices(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; /** * @brief Performs hash join by probing the columns provided in `probe` as per diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 08af40405ec..ce27cfcd616 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -139,16 
+139,6 @@ hash_join::hash_join(cudf::table_view const& build, { } -std::pair, rmm::device_uvector> hash_join::inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->inner_join(probe, probe_on, compare_nulls, stream, mr); -} - std::pair, std::unique_ptr> hash_join::inner_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -162,16 +152,6 @@ std::pair, std::unique_ptr> hash_join: probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } -std::pair, rmm::device_uvector> hash_join::left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->left_join(probe, probe_on, compare_nulls, stream, mr); -} - std::unique_ptr hash_join::left_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -183,16 +163,6 @@ std::unique_ptr hash_join::left_join( return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } -std::pair, rmm::device_uvector> hash_join::full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join(probe, probe_on, compare_nulls, stream, mr); -} - std::unique_ptr hash_join::full_join( cudf::table_view const& probe, std::vector const& probe_on, From b79da68d8f8b551f95eec4a35ac054b9c8b6842f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 13 Jan 2021 07:39:53 -0500 Subject: [PATCH 008/138] Add join_result --- cpp/include/cudf/join.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 2b8fc4bb2cb..ccf983bc72f 100644 --- a/cpp/include/cudf/join.hpp +++ 
b/cpp/include/cudf/join.hpp @@ -31,6 +31,27 @@ namespace cudf { * @file */ +/** + * @brief The result of a `join`. + * + * A `join_result` holds two columns containing the + * left and right gathermaps. + */ // TODO: explain this better +struct join_result { + column_view left_indices; /// < The left gathermap + column_view right_indices; /// < The right gathermap + std::unique_ptr left_buf; + std::unique_ptr right_buf; +}; + +std::unique_ptr inner_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs an inner join on the specified columns of two * tables (`left`, `right`) From 380aa5908b9e6cab466cd581f83cf23d5c0605e6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 13 Jan 2021 08:08:16 -0500 Subject: [PATCH 009/138] Add APIs that return join_result --- cpp/include/cudf/join.hpp | 69 ++++++++++++++++++++++++++++++++++++-- cpp/src/join/join.cu | 70 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index ccf983bc72f..b644ee5c049 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -36,7 +36,7 @@ namespace cudf { * * A `join_result` holds two columns containing the * left and right gathermaps. - */ // TODO: explain this better + */ // TODO: explain this better struct join_result { column_view left_indices; /// < The left gathermap column_view right_indices; /// < The right gathermap @@ -44,7 +44,12 @@ struct join_result { std::unique_ptr right_buf; }; -std::unique_ptr inner_join( +/** + * @brief Performs an inner join on the specified columns of two + * tables (`left`, `right`), and returns the row indices corresponding + * to the result. 
+ */ // TODO: explain this better +join_result inner_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -118,6 +123,18 @@ std::unique_ptr inner_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Performs a left join on the specified columns of two + * tables (`left`, `right`), and returns the row indices corresponding + * to the result. + */ // TODO: explain this better +join_result left_join(cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a left join (also known as left outer join) on the * specified columns of two tables (`left`, `right`) @@ -186,6 +203,18 @@ std::unique_ptr left_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Performs a left join on the specified columns of two + * tables (`left`, `right`), and returns the row indices corresponding + * to the result. + */ // TODO: explain this better +join_result full_join(cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a full join (also known as full outer join) on the * specified columns of two tables (`left`, `right`) @@ -435,6 +464,18 @@ class hash_join { ///< `inner_join`. }; + /** + * @brief Performs an inner join on the specified columns of two + * tables (`left`, `right`), and returns the row indices corresponding + * to the result. 
+ */ // TODO: explain this better + join_result inner_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** * @brief Performs an inner join by probing in the internal hash table. * @@ -479,6 +520,18 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** + * @brief Performs a left join on the specified columns of two + * tables (`left`, `right`), and returns the row indices corresponding + * to the result. + */ // TODO: explain this better + join_result left_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** * @brief Performs a left join by probing in the internal hash table. * @@ -509,6 +562,18 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** + * @brief Performs a full join on the specified columns of two + * tables (`left`, `right`), and returns the row indices corresponding + * to the result. + */ // TODO: explain this better + join_result full_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** * @brief Performs a full join by probing in the internal hash table. 
* diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index ce27cfcd616..488adb45920 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -26,6 +26,17 @@ namespace cudf { namespace detail { +join_result inner_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return cudf::join_result{}; +} + std::unique_ptr
inner_join( table_view const& left_input, table_view const& right_input, @@ -79,6 +90,17 @@ std::unique_ptr
inner_join( } } +join_result left_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return cudf::join_result{}; +} + std::unique_ptr
left_join( table_view const& left_input, table_view const& right_input, @@ -103,6 +125,18 @@ std::unique_ptr
left_join( return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr); } +join_result full_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + std::vector> const& columns_in_common, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return cudf::join_result{}; +} + std::unique_ptr
full_join( table_view const& left_input, table_view const& right_input, @@ -176,6 +210,18 @@ std::unique_ptr hash_join::full_join( // external APIs +join_result inner_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::inner_join( + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + std::unique_ptr
inner_join( table_view const& left, table_view const& right, @@ -190,6 +236,18 @@ std::unique_ptr
inner_join( left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } +join_result left_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::left_join( + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + std::unique_ptr
left_join( table_view const& left, table_view const& right, @@ -204,6 +262,18 @@ std::unique_ptr
left_join( left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } +join_result full_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::full_join( + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + std::unique_ptr
full_join( table_view const& left, table_view const& right, From 3cbb2b47a5e5967462d54a71d2312bb4c0249c40 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 13 Jan 2021 08:08:29 -0500 Subject: [PATCH 010/138] Remove column_in_common --- cpp/src/join/join.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 488adb45920..717cd4b824e 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -129,7 +129,6 @@ join_result full_join(table_view const& left_input, table_view const& right_input, std::vector const& left_on, std::vector const& right_on, - std::vector> const& columns_in_common, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) From 53ae7c90e0e6e45696d067fb48f7326043529b93 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 14 Jan 2021 17:52:41 -0500 Subject: [PATCH 011/138] Add an inner join API that returns gathermaps --- cpp/src/join/hash_join.cu | 35 +++++++++++++++++++++++++++++++++++ cpp/src/join/hash_join.cuh | 13 +++++++++++++ cpp/src/join/join.cu | 31 ++++++++++++++++++++++++++++++- 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index d2d26dbc3b7..74037ed91c0 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -521,6 +521,17 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream); } +join_result hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + CUDF_FUNC_RANGE(); + return compute_hash_join( + probe, probe_on, compare_nulls, stream, mr); +} + std::pair, std::unique_ptr> hash_join::hash_join_impl::inner_join( cudf::table_view const &probe, @@ -611,6 +622,30 @@ 
hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &pro return probe_join_indices(probe_selected, compare_nulls, stream); } +template +join_result hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + auto join_indices = + compute_hash_join_indices(probe, probe_on, compare_nulls, stream, mr); + auto left_map = cudf::column_view(cudf::data_type(type_to_id()), + join_indices.second.size(), + join_indices.second.data(), + nullptr, + 0); + auto right_map = cudf::column_view(cudf::data_type(type_to_id()), + join_indices.second.size(), + join_indices.second.data(), + nullptr, + 0); + auto left_buf = std::make_unique(join_indices.first.release()); + auto right_buf = std::make_unique(join_indices.first.release()); + return join_result{left_map, right_map, std::move(left_buf), std::move(right_buf)}; +} + template std::pair, std::unique_ptr> hash_join::hash_join_impl::compute_hash_join( diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index a222a932edd..41247dcbecb 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -231,6 +231,12 @@ struct hash_join::hash_join_impl { null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); + join_result inner_join(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -265,6 +271,13 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; + template + join_result compute_hash_join(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) const; + /** * @brief Performs hash join by probing the columns provided in `probe` as per * the joining indices given in `probe_on` and returns a (`probe`, `_build`) table pair, which diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 717cd4b824e..8808473f4c3 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -34,7 +34,27 @@ join_result inner_join(table_view const& left_input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::join_result{}; + // Make sure any dictionary columns have matched key sets. + // This will return any new dictionary columns created as well as updated table_views. + auto matched = cudf::dictionary::detail::match_dictionaries( + {left_input.select(left_on), right_input.select(right_on)}, + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + + // now rebuild the table views with the updated ones + auto const left = scatter_columns(matched.second.front(), left_on, left_input); + auto const right = scatter_columns(matched.second.back(), right_on, right_input); + + // For `inner_join`, we can freely choose either the `left` or `right` table to use for + // building/probing the hash map. Because building is typically more expensive than probing, we + // build the hash map from the smaller table. + if (right.num_rows() > left.num_rows()) { + cudf::hash_join hj_obj(left, left_on, compare_nulls, stream); + return hj_obj.inner_join(right, right_on, compare_nulls, stream, mr); + } else { + cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); + return hj_obj.inner_join(left, left_on, compare_nulls, stream, mr); + } } std::unique_ptr
inner_join( @@ -172,6 +192,15 @@ hash_join::hash_join(cudf::table_view const& build, { } +join_result hash_join::inner_join(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return impl->inner_join(probe, probe_on, compare_nulls, stream, mr); +} + std::pair, std::unique_ptr> hash_join::inner_join( cudf::table_view const& probe, std::vector const& probe_on, From fde172b9224f15997e02b4b015abd601b0d9d1da Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 14 Jan 2021 18:03:19 -0500 Subject: [PATCH 012/138] Add remaining APIs to return gathermaps --- cpp/src/join/hash_join.cu | 22 +++++++++++++++++++ cpp/src/join/hash_join.cuh | 12 +++++++++++ cpp/src/join/join.cu | 44 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 74037ed91c0..363a4e8c1ca 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -547,6 +547,17 @@ hash_join::hash_join_impl::inner_join( probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } +join_result hash_join::hash_join_impl::left_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + CUDF_FUNC_RANGE(); + return compute_hash_join( + probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::hash_join_impl::left_join( cudf::table_view const &probe, std::vector const &probe_on, @@ -568,6 +579,17 @@ std::unique_ptr hash_join::hash_join_impl::left_join( std::move(probe_build_pair.second)); } +join_result hash_join::hash_join_impl::full_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const +{ + 
CUDF_FUNC_RANGE(); + return compute_hash_join( + probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::hash_join_impl::full_join( cudf::table_view const &probe, std::vector const &probe_on, diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 41247dcbecb..7bc49bf8683 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -246,6 +246,12 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; + join_result left_join(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + std::unique_ptr left_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -254,6 +260,12 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; + join_result full_join(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + std::unique_ptr full_join( cudf::table_view const& probe, std::vector const& probe_on, diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 8808473f4c3..75bda62d809 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -118,7 +118,18 @@ join_result left_join(table_view const& left_input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::join_result{}; + // Make sure any dictionary columns have matched key sets. + // This will return any new dictionary columns created as well as updated table_views. 
+ auto matched = cudf::dictionary::detail::match_dictionaries( + {left_input.select(left_on), right_input.select(right_on)}, // these should match + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + // now rebuild the table views with the updated ones + table_view const left = scatter_columns(matched.second.front(), left_on, left_input); + table_view const right = scatter_columns(matched.second.back(), right_on, right_input); + + cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); + return hj_obj.left_join(left, left_on, compare_nulls, stream, mr); } std::unique_ptr
left_join( @@ -153,7 +164,18 @@ join_result full_join(table_view const& left_input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::join_result{}; + // Make sure any dictionary columns have matched key sets. + // This will return any new dictionary columns created as well as updated table_views. + auto matched = cudf::dictionary::detail::match_dictionaries( + {left_input.select(left_on), right_input.select(right_on)}, // these should match + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + // now rebuild the table views with the updated ones + table_view const left = scatter_columns(matched.second.front(), left_on, left_input); + table_view const right = scatter_columns(matched.second.back(), right_on, right_input); + + cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); + return hj_obj.full_join(left, left_on, compare_nulls, stream, mr); } std::unique_ptr
full_join( @@ -214,6 +236,15 @@ std::pair, std::unique_ptr> hash_join: probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } +join_result hash_join::left_join(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return impl->left_join(probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::left_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -225,6 +256,15 @@ std::unique_ptr hash_join::left_join( return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } +join_result hash_join::full_join(cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return impl->full_join(probe, probe_on, compare_nulls, stream, mr); +} + std::unique_ptr hash_join::full_join( cudf::table_view const& probe, std::vector const& probe_on, From 4a286dd164a01f0605e5ddf07de1aee2a416c925 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 18 Jan 2021 15:16:22 -0500 Subject: [PATCH 013/138] Add gathermap join test --- cpp/src/join/hash_join.cu | 6 +++--- cpp/tests/join/join_tests.cpp | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 363a4e8c1ca..b8f896a49b4 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -654,8 +654,8 @@ join_result hash_join::hash_join_impl::compute_hash_join(cudf::table_view const auto join_indices = compute_hash_join_indices(probe, probe_on, compare_nulls, stream, mr); auto left_map = cudf::column_view(cudf::data_type(type_to_id()), - join_indices.second.size(), - join_indices.second.data(), + join_indices.first.size(), + join_indices.first.data(), nullptr, 0); auto right_map = 
cudf::column_view(cudf::data_type(type_to_id()), @@ -664,7 +664,7 @@ join_result hash_join::hash_join_impl::compute_hash_join(cudf::table_view const nullptr, 0); auto left_buf = std::make_unique(join_indices.first.release()); - auto right_buf = std::make_unique(join_indices.first.release()); + auto right_buf = std::make_unique(join_indices.second.release()); return join_result{left_map, right_map, std::move(left_buf), std::move(right_buf)}; } diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index b3b86e5cb66..785a414d418 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1429,4 +1429,37 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); } +TEST_F(JoinTest, InnerJoinNoNullsGathermap) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.left_indices})); + auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.right_indices})); + auto lmap_sorted = cudf::gather(cudf::table_view({result.left_indices}), *lmap_sort_order); + auto rmap_sorted = cudf::gather(cudf::table_view({result.right_indices}), *rmap_sort_order); + + column_wrapper lmap_gold{{0, 2, 4}}; + column_wrapper rmap_gold{{1, 1, 4}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); +} + CUDF_TEST_PROGRAM_MAIN() From c756db91175031f523bf9c03dbabbf0c346b1a10 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 18 Jan 2021 18:30:09 -0500 Subject: [PATCH 014/138] Replace -1 with INT_MIN --- cpp/src/join/join_common_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 9965fa496aa..917bcb9bdd5 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -31,7 +31,7 @@ constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; -constexpr size_type JoinNoneValue = -1; +constexpr size_type JoinNoneValue = std::numeric_limits::min(); using VectorPair = std::pair, rmm::device_uvector>; From 6a3d23e8fd89e14af9cc131cb005df8c110ba424 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Jan 2021 09:16:11 -0500 Subject: [PATCH 015/138] Make join_result columns instead of column_views --- cpp/include/cudf/join.hpp | 6 ++---- cpp/src/join/hash_join.cu | 25 ++++++++++++------------- cpp/tests/join/join_tests.cpp | 10 ++++++---- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b644ee5c049..42c42e2ce7e 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -38,10 +38,8 @@ namespace cudf { * left and right gathermaps. 
*/ // TODO: explain this better struct join_result { - column_view left_indices; /// < The left gathermap - column_view right_indices; /// < The right gathermap - std::unique_ptr left_buf; - std::unique_ptr right_buf; + std::unique_ptr left_indices; /// < The left gathermap + std::unique_ptr right_indices; /// < The left gathermap }; /** diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b8f896a49b4..ca875f99bef 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -653,19 +653,18 @@ join_result hash_join::hash_join_impl::compute_hash_join(cudf::table_view const { auto join_indices = compute_hash_join_indices(probe, probe_on, compare_nulls, stream, mr); - auto left_map = cudf::column_view(cudf::data_type(type_to_id()), - join_indices.first.size(), - join_indices.first.data(), - nullptr, - 0); - auto right_map = cudf::column_view(cudf::data_type(type_to_id()), - join_indices.second.size(), - join_indices.second.data(), - nullptr, - 0); - auto left_buf = std::make_unique(join_indices.first.release()); - auto right_buf = std::make_unique(join_indices.second.release()); - return join_result{left_map, right_map, std::move(left_buf), std::move(right_buf)}; + auto join_size = join_indices.first.size(); + auto left_map = std::make_unique(cudf::data_type(type_to_id()), + join_size, + join_indices.first.release(), + rmm::device_buffer{}, + 0); + auto right_map = std::make_unique(cudf::data_type(type_to_id()), + join_size, + join_indices.second.release(), + rmm::device_buffer{}, + 0); + return join_result{std::move(left_map), std::move(right_map)}; } template diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 785a414d418..5b5b4253961 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1451,10 +1451,12 @@ TEST_F(JoinTest, InnerJoinNoNullsGathermap) Table t1(std::move(cols1)); auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); - auto lmap_sort_order = 
cudf::sorted_order(cudf::table_view({result.left_indices})); - auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.right_indices})); - auto lmap_sorted = cudf::gather(cudf::table_view({result.left_indices}), *lmap_sort_order); - auto rmap_sorted = cudf::gather(cudf::table_view({result.right_indices}), *rmap_sort_order); + auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.left_indices->view()})); + auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.right_indices->view()})); + auto lmap_sorted = + cudf::gather(cudf::table_view({result.left_indices->view()}), *lmap_sort_order); + auto rmap_sorted = + cudf::gather(cudf::table_view({result.right_indices->view()}), *rmap_sort_order); column_wrapper lmap_gold{{0, 2, 4}}; column_wrapper rmap_gold{{1, 1, 4}}; From 5dfc2a0cdfaceb742b7a230176d031312e1b4da7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Jan 2021 09:41:37 -0500 Subject: [PATCH 016/138] Replace join_result with a pair of columns --- cpp/include/cudf/join.hpp | 45 ++++++------- cpp/src/join/hash_join.cu | 47 ++++++++------ cpp/src/join/hash_join.cuh | 44 +++++++------ cpp/src/join/join.cu | 117 ++++++++++++++++++---------------- cpp/tests/join/join_tests.cpp | 10 ++- 5 files changed, 135 insertions(+), 128 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 42c42e2ce7e..9de03031a37 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -31,23 +31,12 @@ namespace cudf { * @file */ -/** - * @brief The result of a `join`. - * - * A `join_result` holds two columns containing the - * left and right gathermaps. - */ // TODO: explain this better -struct join_result { - std::unique_ptr left_indices; /// < The left gathermap - std::unique_ptr right_indices; /// < The left gathermap -}; - /** * @brief Performs an inner join on the specified columns of two * tables (`left`, `right`), and returns the row indices corresponding * to the result. 
*/ // TODO: explain this better -join_result inner_join( +std::pair, std::unique_ptr> inner_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -126,12 +115,13 @@ std::unique_ptr inner_join( * tables (`left`, `right`), and returns the row indices corresponding * to the result. */ // TODO: explain this better -join_result left_join(cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, std::unique_ptr> left_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a left join (also known as left outer join) on the @@ -206,12 +196,13 @@ std::unique_ptr left_join( * tables (`left`, `right`), and returns the row indices corresponding * to the result. */ // TODO: explain this better -join_result full_join(cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, std::unique_ptr> full_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a full join (also known as full outer join) on the @@ -467,7 +458,7 @@ class hash_join { * tables (`left`, `right`), and returns the row indices corresponding * to the result. 
*/ // TODO: explain this better - join_result inner_join( + std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, @@ -523,7 +514,7 @@ class hash_join { * tables (`left`, `right`), and returns the row indices corresponding * to the result. */ // TODO: explain this better - join_result left_join( + std::pair, std::unique_ptr> left_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, @@ -565,7 +556,7 @@ class hash_join { * tables (`left`, `right`), and returns the row indices corresponding * to the result. */ // TODO: explain this better - join_result full_join( + std::pair, std::unique_ptr> full_join( cudf::table_view const& probe, std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index ca875f99bef..e7c8f1959d3 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -521,11 +521,12 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream); } -join_result hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, - std::vector const &probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair, std::unique_ptr> +hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( @@ -547,11 +548,12 @@ hash_join::hash_join_impl::inner_join( probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } -join_result hash_join::hash_join_impl::left_join(cudf::table_view const &probe, - std::vector const &probe_on, - 
null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair, std::unique_ptr> +hash_join::hash_join_impl::left_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( @@ -579,11 +581,12 @@ std::unique_ptr hash_join::hash_join_impl::left_join( std::move(probe_build_pair.second)); } -join_result hash_join::hash_join_impl::full_join(cudf::table_view const &probe, - std::vector const &probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair, std::unique_ptr> +hash_join::hash_join_impl::full_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( @@ -645,11 +648,12 @@ hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &pro } template -join_result hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, - std::vector const &probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair, std::unique_ptr> +hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, + std::vector const &probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { auto join_indices = compute_hash_join_indices(probe, probe_on, compare_nulls, stream, mr); @@ -664,7 +668,8 @@ join_result hash_join::hash_join_impl::compute_hash_join(cudf::table_view const join_indices.second.release(), rmm::device_buffer{}, 0); - return join_result{std::move(left_map), std::move(right_map)}; + return std::make_pair, std::unique_ptr>( + std::move(left_map), std::move(right_map)); 
} template diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 7bc49bf8683..463902604c9 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -231,11 +231,12 @@ struct hash_join::hash_join_impl { null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); - join_result inner_join(cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair, std::unique_ptr> inner_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, @@ -246,11 +247,12 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - join_result left_join(cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair, std::unique_ptr> left_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; std::unique_ptr left_join( cudf::table_view const& probe, @@ -260,11 +262,12 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - join_result full_join(cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair, std::unique_ptr> full_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; std::unique_ptr full_join( cudf::table_view const& 
probe, @@ -284,11 +287,12 @@ struct hash_join::hash_join_impl { rmm::mr::device_memory_resource* mr) const; template - join_result compute_hash_join(cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair, std::unique_ptr> compute_hash_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; /** * @brief Performs hash join by probing the columns provided in `probe` as per diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 75bda62d809..61a0a85ef50 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -26,13 +26,14 @@ namespace cudf { namespace detail { -join_result inner_join(table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair, std::unique_ptr> inner_join( + table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -110,13 +111,14 @@ std::unique_ptr
inner_join( } } -join_result left_join(table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair, std::unique_ptr> left_join( + table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -156,13 +158,14 @@ std::unique_ptr
left_join( return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr); } -join_result full_join(table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair, std::unique_ptr> full_join( + table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -214,11 +217,12 @@ hash_join::hash_join(cudf::table_view const& build, { } -join_result hash_join::inner_join(cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair, std::unique_ptr> hash_join::inner_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->inner_join(probe, probe_on, compare_nulls, stream, mr); } @@ -236,11 +240,12 @@ std::pair, std::unique_ptr> hash_join: probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } -join_result hash_join::left_join(cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair, std::unique_ptr> hash_join::left_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->left_join(probe, probe_on, compare_nulls, stream, mr); } 
@@ -256,11 +261,12 @@ std::unique_ptr hash_join::left_join( return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } -join_result hash_join::full_join(cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair, std::unique_ptr> hash_join::full_join( + cudf::table_view const& probe, + std::vector const& probe_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->full_join(probe, probe_on, compare_nulls, stream, mr); } @@ -278,12 +284,13 @@ std::unique_ptr hash_join::full_join( // external APIs -join_result inner_join(table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair, std::unique_ptr> inner_join( + table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::inner_join( @@ -304,12 +311,13 @@ std::unique_ptr
inner_join( left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } -join_result left_join(table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair, std::unique_ptr> left_join( + table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_join( @@ -330,12 +338,13 @@ std::unique_ptr
left_join( left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } -join_result full_join(table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair, std::unique_ptr> full_join( + table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::full_join( diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 5b5b4253961..e4deea9fe1a 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1451,12 +1451,10 @@ TEST_F(JoinTest, InnerJoinNoNullsGathermap) Table t1(std::move(cols1)); auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); - auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.left_indices->view()})); - auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.right_indices->view()})); - auto lmap_sorted = - cudf::gather(cudf::table_view({result.left_indices->view()}), *lmap_sort_order); - auto rmap_sorted = - cudf::gather(cudf::table_view({result.right_indices->view()}), *rmap_sort_order); + auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); + auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); + auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), *lmap_sort_order); + auto rmap_sorted = cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); column_wrapper lmap_gold{{0, 2, 4}}; column_wrapper rmap_gold{{1, 1, 4}}; From 362829bce1b6a5cf89cb6cf9a711a9fec5cf93c5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Jan 2021 10:48:21 -0500 Subject: [PATCH 017/138] Add gathermap test for outer join --- cpp/src/join/hash_join.cu | 4 +- 
cpp/tests/join/join_tests.cpp | 74 ++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index e7c8f1959d3..2bc7396115c 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -556,7 +556,7 @@ hash_join::hash_join_impl::left_join(cudf::table_view const &probe, rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( + return compute_hash_join( probe, probe_on, compare_nulls, stream, mr); } @@ -589,7 +589,7 @@ hash_join::hash_join_impl::full_join(cudf::table_view const &probe, rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( + return compute_hash_join( probe, probe_on, compare_nulls, stream, mr); } diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index e4deea9fe1a..7aaef599297 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -33,11 +33,15 @@ #include #include +#include + template using column_wrapper = cudf::test::fixed_width_column_wrapper; using strcol_wrapper = cudf::test::strings_column_wrapper; using CVector = std::vector>; using Table = cudf::table; +constexpr cudf::size_type NoneValue = + std::numeric_limits::min(); // TODO: how to test if this isn't public? 
struct JoinTest : public cudf::test::BaseFixture { }; @@ -1429,7 +1433,7 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); } -TEST_F(JoinTest, InnerJoinNoNullsGathermap) +TEST_F(JoinTest, InnerJoinGathermap) { column_wrapper col0_0{{3, 1, 2, 0, 2}}; strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); @@ -1462,4 +1466,72 @@ TEST_F(JoinTest, InnerJoinNoNullsGathermap) CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); } +TEST_F(JoinTest, LeftJoinGathermap) +{ + column_wrapper col0_0{{3, 1, 2, 0, 3}}; + strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); + auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); + auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), *lmap_sort_order); + auto rmap_sorted = cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); + + column_wrapper lmap_gold{{0, 1, 2, 3, 4}}; + column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); +} + +TEST_F(JoinTest, FullJoinGatherMap) +{ + column_wrapper col0_0{{3, 1, 2, 0, 3}}; + strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); 
+ column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; + strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); + auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); + auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), *lmap_sort_order); + auto rmap_sorted = cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); + + column_wrapper lmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3}}; + column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); +} + CUDF_TEST_PROGRAM_MAIN() From 4e4380cec9d6adc0f794bf4aec733443bf1610e9 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Jan 2021 17:30:38 -0500 Subject: [PATCH 018/138] Add and pass full join gathermap test --- cpp/src/join/hash_join.cu | 35 ++++++++++++++++++++--------------- cpp/src/join/hash_join.cuh | 7 ++----- cpp/tests/join/join_tests.cpp | 4 ++-- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 2bc7396115c..16a453d7068 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -432,25 +432,22 @@ std::pair, std::unique_ptr
> construct_join_output_ std::unique_ptr
common_table = std::make_unique
(); // Construct the joined columns if (join_kind::FULL_JOIN == JoinKind) { - auto complement_indices = get_left_join_indices_complement( - joined_indices.second, probe.num_rows(), build.num_rows(), stream); if (not columns_in_common.empty()) { auto common_from_build = detail::gather(build.select(build_common_col), - complement_indices.second.begin(), - complement_indices.second.end(), + joined_indices.second.begin() + probe.num_rows(), + joined_indices.second.end(), bounds_policy, stream, rmm::mr::get_current_device_resource()); auto common_from_probe = detail::gather(probe.select(probe_common_col), joined_indices.first.begin(), - joined_indices.first.end(), + joined_indices.first.begin() + probe.num_rows(), bounds_policy, stream, rmm::mr::get_current_device_resource()); common_table = cudf::detail::concatenate( - {common_from_build->view(), common_from_probe->view()}, stream, mr); + {common_from_probe->view(), common_from_build->view()}, stream, mr); } - joined_indices = concatenate_vector_pairs(complement_indices, joined_indices, stream); } else { if (not columns_in_common.empty()) { common_table = detail::gather(probe.select(probe_common_col), @@ -641,10 +638,7 @@ hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &pro [](const auto &b, const auto &p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); - constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) - ? 
cudf::detail::join_kind::LEFT_JOIN - : JoinKind; - return probe_join_indices(probe_selected, compare_nulls, stream); + return probe_join_indices(probe_selected, compare_nulls, stream); } template @@ -706,14 +700,13 @@ hash_join::hash_join_impl::compute_hash_join( } template -std::enable_if_t, rmm::device_uvector>> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const { // Trivial left join case - exit early - if (!_hash_table && JoinKind == cudf::detail::join_kind::LEFT_JOIN) { + if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) { return get_trivial_left_join_indices(probe, stream); } @@ -721,8 +714,20 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, auto build_table = cudf::table_device_view::create(_build_selected, stream); auto probe_table = cudf::table_device_view::create(probe, stream); - return cudf::detail::probe_join_hash_table( + + constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) + ? cudf::detail::join_kind::LEFT_JOIN + : JoinKind; + auto join_indices = cudf::detail::probe_join_hash_table( *build_table, *probe_table, *_hash_table, compare_nulls, stream); + + if (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, probe.num_rows(), _build.num_rows(), stream); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + } + + return join_indices; } } // namespace cudf diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 463902604c9..c33029dea55 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -361,11 +361,8 @@ struct hash_join::hash_join_impl { * @return Join output indices vector pair. 
*/ template - std::enable_if_t, rmm::device_uvector>> - probe_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream) const; + std::pair, rmm::device_uvector> probe_join_indices( + cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const; }; } // namespace cudf diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 7aaef599297..f0748e2ec29 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1527,8 +1527,8 @@ TEST_F(JoinTest, FullJoinGatherMap) auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), *lmap_sort_order); auto rmap_sorted = cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); - column_wrapper lmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3}}; - column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3}}; + column_wrapper lmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; + column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); From 339a13d15f74742dd916235f7bc0f90bcef63cf0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Jan 2021 11:56:18 -0500 Subject: [PATCH 019/138] Begin Python-side refactor --- python/cudf/cudf/_lib/cpp/join.pxd | 18 ++--- python/cudf/cudf/_lib/join.pyx | 112 +++-------------------------- 2 files changed, 19 insertions(+), 111 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd index 10edf370f5d..b25062d529e 100644 --- a/python/cudf/cudf/_lib/cpp/join.pxd +++ b/python/cudf/cudf/_lib/cpp/join.pxd @@ -4,32 +4,32 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from libcpp.pair cimport pair from libcpp cimport bool +from libcpp.pair 
cimport pair +from libcpp.memory cimport unique_ptr +from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/join.hpp" namespace "cudf" nogil: - cdef unique_ptr[table] inner_join( + cdef pair[unique_ptr[column], unique_ptr[column]] inner_join( const table_view left, const table_view right, const vector[int] left_on, - const vector[int] right_on, - const vector[pair[int, int]] columns_in_common + const vector[int] right_on ) except + - cdef unique_ptr[table] left_join( + cdef pair[unique_ptr[column], unique_ptr[column]] left_join( const table_view left, const table_view right, const vector[int] left_on, - const vector[int] right_on, - const vector[pair[int, int]] columns_in_common + const vector[int] right_on ) except + - cdef unique_ptr[table] full_join( + cdef pair[unique_ptr[column], unique_ptr[column]] full_join( const table_view left, const table_view right, const vector[int] left_on, - const vector[int] right_on, - const vector[pair[int, int]] columns_in_common + const vector[int] right_on ) except + cdef unique_ptr[table] left_semi_join( const table_view left, diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 38f13b9f994..9291c42625f 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -9,12 +9,15 @@ from libcpp.vector cimport vector from libcpp.pair cimport pair from libcpp cimport bool +from cudf._lib.column cimport Column from cudf._lib.table cimport Table, columns_from_ptr +from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cimport cudf._lib.cpp.join as cpp_join + cpdef join(Table lhs, Table rhs, object how, @@ -27,7 +30,6 @@ cpdef join(Table lhs, """ Call libcudf++ join for full outer, inner and left joins. 
""" - cdef Table c_lhs = lhs cdef Table c_rhs = rhs @@ -51,17 +53,7 @@ cpdef join(Table lhs, cdef vector[int] all_left_inds = range( lhs._num_columns + (lhs._num_indices * left_index) ) - cdef vector[int] all_right_inds = range( - rhs._num_columns + (rhs._num_indices * right_index) - ) - - result_col_names = compute_result_col_names(lhs, rhs, how) - columns_in_common = OrderedDict() - cdef vector[pair[int, int]] c_columns_in_common - - # keep track of where the desired index column will end up - result_index_pos = None if left_index or right_index: # If either true, we need to process both indices as columns lhs_view = c_lhs.view() @@ -69,54 +61,27 @@ cpdef join(Table lhs, left_join_cols = list(lhs._index_names) + list(lhs._data.keys()) right_join_cols = list(rhs._index_names) + list(rhs._data.keys()) + if left_index and right_index: # Index columns will be common, on the left, dropped from right # Index name is from the left # Both views, must take index column indices left_on_indices = right_on_indices = range(lhs._num_indices) - result_idx_positions = range(lhs._num_indices) - result_index_names = lhs._index_names - elif left_index: # Joins left index columns with right 'on' columns left_on_indices = range(lhs._num_indices) right_on_indices = [ right_join_cols.index(on_col) for on_col in right_on ] - - # The left index columns 'become' the new RHS columns - # and the right index 'survives' - result_idx_positions = range( - len(left_join_cols), len(left_join_cols) + lhs._num_indices - ) - result_index_names = rhs._index_names - - # but since the common columns are gathered from the left - # the rhs 'on' cols are returned on the left of the result - # rearrange the names so account for this - common = [None] * rhs._num_indices - for i in range(rhs._num_indices): - common[i] = result_col_names.pop( - result_col_names.index(right_on[i]) - ) - result_col_names = common + result_col_names elif right_index: # Joins right index columns with left 'on' columns 
right_on_indices = range(rhs._num_indices) left_on_indices = [ left_join_cols.index(on_col) for on_col in left_on ] - - # The right index columns 'become' the new LHS columns - # and the left index survives - # since they are already gathered from the left, - # no rearranging has to be done - result_idx_positions = range(lhs._num_indices) - result_index_names = lhs._index_names for i_l, i_r in zip(left_on_indices, right_on_indices): left_on_ind.push_back(i_l) right_on_ind.push_back(i_r) - columns_in_common[(i_l, i_r)] = None else: # cuDF's Python layer will create a new RangeIndex for this case lhs_view = c_lhs.data_view() @@ -131,16 +96,10 @@ cpdef join(Table lhs, if left_index == right_index: for name in left_on: left_on_ind.push_back(left_join_cols.index(name)) - if name in right_on: - if (left_on.index(name) == right_on.index(name)): - columns_in_common[( - left_join_cols.index(name), - right_join_cols.index(name) - )] = None for name in right_on: right_on_ind.push_back(right_join_cols.index(name)) - c_columns_in_common = list(columns_in_common.keys()) - cdef unique_ptr[table] c_result + + cdef pair[unique_ptr[column], unique_ptr[column]] c_result if how == 'inner': with nogil: c_result = move(cpp_join.inner_join( @@ -148,7 +107,6 @@ cpdef join(Table lhs, rhs_view, left_on_ind, right_on_ind, - c_columns_in_common )) elif how == 'left': with nogil: @@ -157,7 +115,6 @@ cpdef join(Table lhs, rhs_view, left_on_ind, right_on_ind, - c_columns_in_common )) elif how == 'outer': with nogil: @@ -166,57 +123,8 @@ cpdef join(Table lhs, rhs_view, left_on_ind, right_on_ind, - c_columns_in_common - )) - elif how == 'leftsemi': - with nogil: - c_result = move(cpp_join.left_semi_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - all_left_inds )) - elif how == 'leftanti': - with nogil: - c_result = move(cpp_join.left_anti_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - all_left_inds - )) - - all_cols_py = columns_from_ptr(move(c_result)) - if 
left_index or right_index: - ind_cols = OrderedDict() - for name, pos in zip( - result_index_names[::-1], result_idx_positions[::-1] - ): - ind_cols[name] = all_cols_py.pop(pos) - index = OrderedDict() - for k, v in reversed(ind_cols.items()): - index[k] = v - index = Table(index) - else: - index = None - data_ordered_dict = OrderedDict(zip(result_col_names, all_cols_py)) - return Table(data=data_ordered_dict, index=index) - - -def compute_result_col_names(lhs, rhs, how): - """ - Determine the names of the data columns in the result of - a libcudf join, based on the original left and right frames - as well as the type of join that was performed. - """ - if how in {"left", "inner", "outer", "leftsemi", "leftanti"}: - a = lhs._data.keys() - if how not in {"leftsemi", "leftanti"}: - return list(chain(a, (k for k in rhs._data.keys() - if k not in lhs._data.keys()))) - return list(a) - else: - raise NotImplementedError( - f"{how} merge not supported yet" - ) + return ( + Column.from_unique_ptr(move(c_result.first)), + Column.from_unique_ptr(move(c_result.second)) + ) From 044eac1c4f9c7df9e657c229c43689b3def3e8db Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 8 Feb 2021 14:03:35 -0500 Subject: [PATCH 020/138] Add left_semi and left_anti join APIs that return gathermaps --- cpp/include/cudf/join.hpp | 18 ++++ cpp/src/join/semi_join.cu | 167 ++++++++++++++++++----------- cpp/tests/join/semi_join_tests.cpp | 7 +- 3 files changed, 130 insertions(+), 62 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 9de03031a37..0a42d28de2d 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -328,6 +328,15 @@ std::unique_ptr left_semi_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** TODO: document */ +std::unique_ptr left_semi_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + 
std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a left anti join on the specified columns of two * tables (`left`, `right`) @@ -384,6 +393,15 @@ std::unique_ptr left_anti_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** TODO: document */ +std::unique_ptr left_anti_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a cross join on two tables (`left`, `right`) * diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 9d046f9983c..59298c75f1e 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -26,11 +26,88 @@ #include #include +#include #include #include namespace cudf { namespace detail { + +template +std::unique_ptr left_semi_anti_join( + cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); + CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty"); + CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); + + auto const left_num_rows = left.num_rows(); + auto const right_num_rows = right.num_rows(); + + // Only care about existence, so we'll use an unordered map (other joins need a multimap) + using hash_table_type = concurrent_unordered_map; + + // Create hash table containing all keys found in right table + auto right_rows_d = 
table_device_view::create(right, stream); + size_t const hash_table_size = compute_hash_table_size(right_num_rows); + row_hash hash_build{*right_rows_d}; + row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; + + // Going to join it with left table + auto left_rows_d = table_device_view::create(left, stream); + row_hash hash_probe{*left_rows_d}; + row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; + + auto hash_table_ptr = hash_table_type::create(hash_table_size, + stream, + std::numeric_limits::max(), + std::numeric_limits::max(), + hash_build, + equality_build); + auto hash_table = *hash_table_ptr; + + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + right_num_rows, + [hash_table] __device__(size_type idx) mutable { + hash_table.insert(thrust::make_pair(idx, true)); + }); + + // + // Now we have a hash table, we need to iterate over the rows of the left table + // and check to see if they are contained in the hash table + // + + // For semi join we want contains to be true, for anti join we want contains to be false + bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); + + rmm::device_uvector gather_map(left_num_rows, stream); + + // gather_map_end will be the end of valid data in gather_map + auto gather_map_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(left_num_rows), + gather_map.begin(), + [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) { + auto pos = hash_table.find(idx, hash_probe, equality_probe); + return (pos != hash_table.end()) == join_type_boolean; + }); + + auto join_size = std::distance(gather_map.begin(), gather_map_end); + return std::make_unique(cudf::data_type(type_to_id()), + join_size, + gather_map.release(), + rmm::device_buffer{}, + 0); +} + /** * @brief Performs a left semi or anti join on 
the specified columns of two * tables (left, right) @@ -80,20 +157,13 @@ std::unique_ptr left_semi_anti_join( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); - CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty"); - CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); - if (0 == return_columns.size()) { return empty_like(left.select(return_columns)); } if (is_trivial_join(left, right, left_on, right_on, JoinKind)) { return empty_like(left.select(return_columns)); } - auto const left_num_rows = left.num_rows(); - auto const right_num_rows = right.num_rows(); - - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_num_rows)) { + if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) { // Everything matches, just copy the proper columns from the left table return std::make_unique
(left.select(return_columns), stream, mr); } @@ -108,65 +178,18 @@ std::unique_ptr left_semi_anti_join( auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); - // Only care about existence, so we'll use an unordered map (other joins need a multimap) - using hash_table_type = concurrent_unordered_map; - - // Create hash table containing all keys found in right table - auto right_rows_d = table_device_view::create(right_selected, stream); - size_t const hash_table_size = compute_hash_table_size(right_num_rows); - row_hash hash_build{*right_rows_d}; - row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; + auto gather_map = left_semi_anti_join( + left_selected, right_selected, left_on, right_on, compare_nulls, stream); - // Going to join it with left table - auto left_rows_d = table_device_view::create(left_selected, stream); - row_hash hash_probe{*left_rows_d}; - row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; - - auto hash_table_ptr = hash_table_type::create(hash_table_size, - stream, - std::numeric_limits::max(), - std::numeric_limits::max(), - hash_build, - equality_build); - auto hash_table = *hash_table_ptr; - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - right_num_rows, - [hash_table] __device__(size_type idx) mutable { - hash_table.insert(thrust::make_pair(idx, true)); - }); - - // - // Now we have a hash table, we need to iterate over the rows of the left table - // and check to see if they are contained in the hash table - // - - // For semi join we want contains to be true, for anti join we want contains to be false - bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); - - rmm::device_vector gather_map(left_num_rows); - - // gather_map_end will be the end of valid data in gather_map - auto gather_map_end = thrust::copy_if( - rmm::exec_policy(stream), - 
thrust::make_counting_iterator(0), - thrust::make_counting_iterator(left_num_rows), - gather_map.begin(), - [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) { - auto pos = hash_table.find(idx, hash_probe, equality_probe); - return (pos != hash_table.end()) == join_type_boolean; - }); - - // rebuild left table for call to gather auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather(left_updated.select(return_columns), - gather_map.begin(), - gather_map_end, + gather_map->view().template begin(), + gather_map->view().template end(), out_of_bounds_policy::DONT_CHECK, stream, mr); } + } // namespace detail std::unique_ptr left_semi_join(cudf::table_view const& left, @@ -182,6 +205,18 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); } +std::unique_ptr left_semi_join(cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::left_semi_anti_join( + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + std::unique_ptr left_anti_join(cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -195,4 +230,16 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); } +std::unique_ptr left_anti_join(cudf::table_view const& left, + cudf::table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::left_semi_anti_join( + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + } // namespace cudf 
diff --git a/cpp/tests/join/semi_join_tests.cpp b/cpp/tests/join/semi_join_tests.cpp index 13c74616484..a665d07ee3c 100644 --- a/cpp/tests/join/semi_join_tests.cpp +++ b/cpp/tests/join/semi_join_tests.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -476,7 +477,8 @@ TEST_F(JoinTest, LeftSemiJoin_empty_result) cudf::table table_a(std::move(column_a)); cudf::table table_b(std::move(column_b)); - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {}); + auto join_table = + cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, std::vector{}); EXPECT_EQ(join_table->num_columns(), 0); EXPECT_EQ(join_table->num_rows(), 0); @@ -540,7 +542,8 @@ TEST_F(JoinTest, LeftAntiJoin_empty_result) cudf::table table_a(std::move(column_a)); cudf::table table_b(std::move(column_b)); - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {}); + auto join_table = + cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, std::vector{}); EXPECT_EQ(join_table->num_columns(), 0); EXPECT_EQ(join_table->num_rows(), 0); From 555d5ec5ad9ca04142e8b1c6a9448637f9d900e8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 8 Feb 2021 16:08:19 -0500 Subject: [PATCH 021/138] Add Cython bindings --- python/cudf/cudf/_lib/cpp/join.pxd | 10 +- python/cudf/cudf/_lib/join.pyx | 161 ++++++++++------------------- 2 files changed, 62 insertions(+), 109 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd index b25062d529e..55180e2b74e 100644 --- a/python/cudf/cudf/_lib/cpp/join.pxd +++ b/python/cudf/cudf/_lib/cpp/join.pxd @@ -19,29 +19,31 @@ cdef extern from "cudf/join.hpp" namespace "cudf" nogil: const vector[int] left_on, const vector[int] right_on ) except + + cdef pair[unique_ptr[column], unique_ptr[column]] left_join( const table_view left, const table_view right, const vector[int] left_on, const vector[int] right_on ) except + + cdef pair[unique_ptr[column], 
unique_ptr[column]] full_join( const table_view left, const table_view right, const vector[int] left_on, const vector[int] right_on ) except + - cdef unique_ptr[table] left_semi_join( + + cdef unique_ptr[column] left_semi_join( const table_view left, const table_view right, const vector[int] left_on, const vector[int] right_on, - const vector[int] return_columns ) except + - cdef unique_ptr[table] left_anti_join( + + cdef unique_ptr[column] left_anti_join( const table_view left, const table_view right, const vector[int] left_on, const vector[int] right_on, - const vector[int] return_columns ) except + diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 9291c42625f..94d82aa2638 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -18,113 +18,64 @@ from cudf._lib.cpp.table.table_view cimport table_view cimport cudf._lib.cpp.join as cpp_join -cpdef join(Table lhs, - Table rhs, - object how, - object method, - object left_on=None, - object right_on=None, - bool left_index=False, - bool right_index=False - ): - """ - Call libcudf++ join for full outer, inner and left joins. 
- """ - cdef Table c_lhs = lhs - cdef Table c_rhs = rhs - - # Views might or might not include index - cdef table_view lhs_view - cdef table_view rhs_view - - # Will hold the join column indices into L and R tables - cdef vector[int] left_on_ind - cdef vector[int] right_on_ind - - # If left/right index, will pass a full view - # must offset the data column indices by # of index columns - num_inds_left = len(left_on) + (lhs._num_indices * left_index) - num_inds_right = len(right_on) + (rhs._num_indices * right_index) - left_on_ind.reserve(num_inds_left) - right_on_ind.reserve(num_inds_right) - - # Only used for semi or anti joins - # The result columns are only the left hand columns - cdef vector[int] all_left_inds = range( - lhs._num_columns + (lhs._num_indices * left_index) - ) - - if left_index or right_index: - # If either true, we need to process both indices as columns - lhs_view = c_lhs.view() - rhs_view = c_rhs.view() - - left_join_cols = list(lhs._index_names) + list(lhs._data.keys()) - right_join_cols = list(rhs._index_names) + list(rhs._data.keys()) - - if left_index and right_index: - # Index columns will be common, on the left, dropped from right - # Index name is from the left - # Both views, must take index column indices - left_on_indices = right_on_indices = range(lhs._num_indices) - elif left_index: - # Joins left index columns with right 'on' columns - left_on_indices = range(lhs._num_indices) - right_on_indices = [ - right_join_cols.index(on_col) for on_col in right_on - ] - elif right_index: - # Joins right index columns with left 'on' columns - right_on_indices = range(rhs._num_indices) - left_on_indices = [ - left_join_cols.index(on_col) for on_col in left_on - ] - for i_l, i_r in zip(left_on_indices, right_on_indices): - left_on_ind.push_back(i_l) - right_on_ind.push_back(i_r) - else: - # cuDF's Python layer will create a new RangeIndex for this case - lhs_view = c_lhs.data_view() - rhs_view = c_rhs.data_view() - - left_join_cols = 
list(lhs._data.keys()) - right_join_cols = list(rhs._data.keys()) - - # If both left/right_index, joining on indices plus additional cols - # If neither, joining on just cols, not indices - # In both cases, must match up additional column indices in lhs/rhs - if left_index == right_index: - for name in left_on: - left_on_ind.push_back(left_join_cols.index(name)) - for name in right_on: - right_on_ind.push_back(right_join_cols.index(name)) - +cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): + # left, inner and outer join + cdef vector[int] c_left_on = left_on + cdef vector[int] c_right_on = right_on cdef pair[unique_ptr[column], unique_ptr[column]] c_result - if how == 'inner': - with nogil: - c_result = move(cpp_join.inner_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - )) - elif how == 'left': - with nogil: - c_result = move(cpp_join.left_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - )) - elif how == 'outer': - with nogil: - c_result = move(cpp_join.full_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - )) + cdef table_view c_lhs = lhs.view() + cdef table_view c_rhs = rhs.view() + + if how == "inner": + c_result = move(cpp_join.inner_join( + c_lhs, + c_rhs, + c_left_on, + c_right_on, + )) + elif how == "left": + c_result = move(cpp_join.left_join( + c_lhs, + c_rhs, + c_left_on, + c_right_on, + )) + elif how == "outer": + c_result = move(cpp_join.outer_join( + c_lhs, + c_rhs + c_left_on, + c_right_on + )) + else: + raise ValueError(f"Unkown join type {how}") return ( Column.from_unique_ptr(move(c_result.first)), Column.from_unique_ptr(move(c_result.second)) ) + + +cpdef join_semi_anti(Table lhs, Table rhs, left_on, right_on, how=None): + # left-semi and left-anti joins + cdef vector[int] c_left_on = left_on + cdef vector[int] c_right_on = right_on + cdef unique_ptr[column] c_result + cdef table_view c_lhs = lhs.view() + cdef table_view c_rhs = rhs.view() + + if how == "semi": + c_result = 
move(cpp_join.left_semi_join( + c_lhs, + c_rhs, + c_left_on, + c_right_on + )) + elif how == "anti": + c_result = move(cpp_join.left_anti_join( + c_lhs, + c_rhs, + c_left_on, + c_right_on + )) + else: + raise ValueError(f"Invalid join type {how}") From 56ae6162fbb25477b1edc6e0f68c9b3276df8f83 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 9 Feb 2021 09:45:41 -0500 Subject: [PATCH 022/138] full -> outer --- python/cudf/cudf/_lib/join.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 94d82aa2638..5d07604f3bb 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -41,9 +41,9 @@ cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): c_right_on, )) elif how == "outer": - c_result = move(cpp_join.outer_join( + c_result = move(cpp_join.full_join( c_lhs, - c_rhs + c_rhs, c_left_on, c_right_on )) From d447924fcbc24042459231e316b5786683788ad0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 9 Feb 2021 16:38:21 -0500 Subject: [PATCH 023/138] Progress --- python/cudf/cudf/core/join/join.py | 347 ++++++++++++++--------------- 1 file changed, 166 insertions(+), 181 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index c6da3ee8dc4..257741e6f4c 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,34 +1,88 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
import itertools +from collections import namedtuple import pandas as pd import cudf from cudf import _lib as libcudf -from cudf._lib.join import compute_result_col_names from cudf.core.join.casting_logic import ( _input_to_libcudf_castrules_any, _libcudf_to_output_castrules, ) +class _MISSING_TYPE: + pass + + +MISSING = _MISSING_TYPE() + + +class JoinKey: + # A JoinKey represents one column of a Series + # or DataFrame - either an index column or a + # data column + + # we need a different sentinel value than `None` + # because `None` is totally a valid index/column name + def __init__(self, obj, column=MISSING, index=MISSING): + self.obj = obj + self.column, self.index = column, index + + def get_numeric_index(self): + # get the position of the column (including any index columns) + if self.index is MISSING: + return len(self.obj.index.names) + self.obj.columns.get_loc( + self.column + ) + else: + return self.obj.index.names.index(self.index) + + @property + def name(self): + # get the name of the column + if self.index is MISSING: + return self.column + else: + return self.index + + @property + def value(self): + # get the column + if self.index is MISSING: + return self.obj._data[self.name] + else: + return self.obj._index._data[self.name] + + def set_value(self, value): + # set the colum + if self.index is MISSING: + self.obj._data[self.name] = value + else: + self.obj._index._data[self.name] = value + + +JoinKeys = namedtuple("JoinKeys", ["left", "right"]) + + class Merge(object): def __init__( self, lhs, rhs, - on, - left_on, - right_on, - left_index, - right_index, - how, - sort, - lsuffix, - rsuffix, - method, - indicator, - suffixes, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + how="inner", + sort=False, + lsuffix="_x", + rsuffix="_y", + method=None, + indicator=None, + suffixes=None, ): """ Manage the merging of two Frames. 
@@ -72,32 +126,89 @@ def __init__( Left and right suffixes specified together, unpacked into lsuffix and rsuffix. """ - self.lhs = lhs - self.rhs = rhs - self.left_index = left_index - self.right_index = right_index - self.method = method - self.sort = sort - - # check that the merge is valid - - self.validate_merge_cfg( + self.validate_merge_params( lhs, rhs, - on, - left_on, - right_on, - left_index, - right_index, - how, - lsuffix, - rsuffix, - suffixes, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + lsuffix=lsuffix, + rsuffix=rsuffix, + suffixes=suffixes, ) + + # warning: self.lhs and self.rhs are mutated both before + # and after the join + self.lhs = lhs.copy(deep=False) + self.rhs = rhs.copy(deep=False) + + self.on = on + self.left_on = left_on + self.right_on = right_on + self.left_index = left_index + self.right_index = right_index self.how = how - self.preprocess_merge_params( - on, left_on, right_on, lsuffix, rsuffix, suffixes - ) + self.lsuffix = lsuffix + self.rsuffix = rsuffix + self.suffixes = suffixes + + self.compute_join_keys() + + def compute_join_keys(self): + def _coerce_to_tuple(obj): + if hasattr(obj, "__iter__") and not isinstance(obj, str): + return tuple(obj) + else: + return (obj,) + + if ( + self.left_index + or self.right_index + or self.left_on + or self.right_on + ): + if self.left_index: + left_keys = [ + JoinKey(obj=self.lhs, index=on) + for on in self.lhs.index.names + ] + else: + # TODO: require left_on or left_index to be specified + left_keys = [ + JoinKey(obj=self.lhs, column=on) + for on in _coerce_to_tuple(self.left_on) + ] + if self.right_index: + right_keys = [ + JoinKey(obj=self.rhs, index=on) + for on in self.rhs.index.names + ] + else: + # TODO: require right_on or right_index to be specified + right_keys = [ + JoinKey(obj=self.rhs, column=on) + for on in _coerce_to_tuple(self.right_on) + ] + else: + # Use `on` if provided. 
Otherwise, + # implicitly use identically named columns as the key columns: + on_names = ( + _coerce_to_tuple(self.on) + if self.on is not None + else set(self.lhs._data.keys()) & set(self.rhs._data.keys()) + ) + left_keys = [JoinKey(obj=self.lhs, column=on) for on in on_names] + right_keys = [JoinKey(obj=self.rhs, column=on) for on in on_names] + + if len(left_keys) != len(right_keys): + raise ValueError( + "Merge operands must have same number of join key columns" + ) + + self._keys = JoinKeys(left=left_keys, right=right_keys) def perform_merge(self): """ @@ -105,9 +216,8 @@ def perform_merge(self): necessary, cast the input key columns to compatible types. Potentially also cast the output back to categorical. """ - output_dtypes = self.compute_output_dtypes() - self.typecast_input_to_libcudf() - libcudf_result = libcudf.join.join( + self.match_key_dtypes(_input_to_libcudf_castrules_any) + left_rows, right_rows = libcudf.join.join( self.lhs, self.rhs, self.how, @@ -117,83 +227,24 @@ def perform_merge(self): left_index=self.left_index, right_index=self.right_index, ) - result = self.out_class._from_table(libcudf_result) - result = self.typecast_libcudf_to_output(result, output_dtypes) - if isinstance(result, cudf.Index): - return result - else: - return result[ - compute_result_col_names(self.lhs, self.rhs, self.how) - ] - - def preprocess_merge_params( - self, on, left_on, right_on, lsuffix, rsuffix, suffixes - ): - """ - Translate a valid configuration of user input parameters into - the subset of input configurations handled by the cython layer. - Apply suffixes to columns. 
- """ + return self.construct_result(left_rows, right_rows) - self.out_class = cudf.DataFrame - if isinstance(self.lhs, cudf.MultiIndex) or isinstance( - self.rhs, cudf.MultiIndex - ): - self.out_class = cudf.MultiIndex - elif isinstance(self.lhs, cudf.Index): - self.out_class = self.lhs.__class__ + def construct_result(self, left_rows, right_rows): + self.match_key_dtypes(_libcudf_to_output_castrules) - if on: - on = [on] if isinstance(on, str) else list(on) - left_on = right_on = on + # first construct the index: + if self.left_index and not self.right_index: + out_index = self.rhs.index.iloc[right_rows] + elif self.right_index and not self.left_index: + out_index = self.lhs.index.iloc[left_rows] else: - if left_on: - left_on = ( - [left_on] if isinstance(left_on, str) else list(left_on) - ) - if right_on: - right_on = ( - [right_on] if isinstance(right_on, str) else list(right_on) - ) - - same_named_columns = set(self.lhs._data.keys()) & set( - self.rhs._data.keys() - ) - if not (left_on or right_on) and not ( - self.left_index and self.right_index - ): - left_on = right_on = list(same_named_columns) - - no_suffix_cols = [] - if left_on and right_on: - no_suffix_cols = [ - left_name - for left_name, right_name in zip(left_on, right_on) - if left_name == right_name and left_name in same_named_columns - ] - - if suffixes: - lsuffix, rsuffix = suffixes - for name in same_named_columns: - if name not in no_suffix_cols: - self.lhs.rename( - {name: f"{name}{lsuffix}"}, inplace=True, axis=1 - ) - self.rhs.rename( - {name: f"{name}{rsuffix}"}, inplace=True, axis=1 - ) - if left_on and name in left_on: - left_on[left_on.index(name)] = f"{name}{lsuffix}" - if right_on and name in right_on: - right_on[right_on.index(name)] = f"{name}{rsuffix}" + out_index = None - self.left_on = left_on if left_on is not None else [] - self.right_on = right_on if right_on is not None else [] - self.lsuffix = lsuffix - self.rsuffix = rsuffix + # now construct the data: + return out_index 
@staticmethod - def validate_merge_cfg( + def validate_merge_params( lhs, rhs, on, @@ -227,50 +278,6 @@ def validate_merge_cfg( ): raise ValueError("Can not merge on unnamed Series") - # Keys need to be in their corresponding operands - if on: - if isinstance(on, str): - on_keys = [on] - elif isinstance(on, tuple): - on_keys = list(on) - else: - on_keys = on - for key in on_keys: - if not (key in lhs._data.keys() and key in rhs._data.keys()): - raise KeyError(f"on key {on} not in both operands") - elif left_on and right_on: - left_on_keys = ( - [left_on] if not isinstance(left_on, list) else left_on - ) - right_on_keys = ( - [right_on] if not isinstance(right_on, list) else right_on - ) - - for key in left_on_keys: - if key not in lhs._data.keys(): - raise KeyError(f'Key "{key}" not in left operand') - for key in right_on_keys: - if key not in rhs._data.keys(): - raise KeyError(f'Key "{key}" not in right operand') - - # Require same total number of columns to join on in both operands - len_left_on = 0 - len_right_on = 0 - if left_on: - len_left_on += ( - len(left_on) if pd.api.types.is_list_like(left_on) else 1 - ) - if right_on: - len_right_on += ( - len(right_on) if pd.api.types.is_list_like(right_on) else 1 - ) - if not (len_left_on + left_index * lhs._num_indices) == ( - len_right_on + right_index * rhs._num_indices - ): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - # If nothing specified, must have common cols to use implicitly same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys()) if ( @@ -297,39 +304,17 @@ def validate_merge_cfg( "lsuffix and rsuffix are not defined" ) - def typecast_input_to_libcudf(self): + def match_key_dtypes(self, match_func): """ Check each pair of join keys in the left and right hand operands and apply casting rules to match their types before passing the result to libcudf. 
""" - lhs_keys, rhs_keys, lhs_cols, rhs_cols = [], [], [], [] - if self.left_index: - lhs_keys.append(self.lhs.index._data.keys()) - lhs_cols.append(self.lhs.index) - if self.right_index: - rhs_keys.append(self.rhs.index._data.keys()) - rhs_cols.append(self.rhs.index) - if self.left_on: - lhs_keys.append(self.left_on) - lhs_cols.append(self.lhs) - if self.right_on: - rhs_keys.append(self.right_on) - rhs_cols.append(self.rhs) - - for l_key_grp, r_key_grp, l_col_grp, r_col_grp in zip( - lhs_keys, rhs_keys, lhs_cols, rhs_cols - ): - for l_key, r_key in zip(l_key_grp, r_key_grp): - to_dtype = _input_to_libcudf_castrules_any( - l_col_grp._data[l_key], r_col_grp._data[r_key], self.how - ) - l_col_grp._data[l_key] = l_col_grp._data[l_key].astype( - to_dtype - ) - r_col_grp._data[r_key] = r_col_grp._data[r_key].astype( - to_dtype - ) + for left_key, right_key in zip(self._keys.left, self._keys.right): + lcol, rcol = left_key.value, right_key.value + dtype = match_func(lcol, rcol, how=self.how) + left_key.set_value(lcol.astype(dtype)) + right_key.set_value(rcol.astype(dtype)) def compute_output_dtypes(self): """ From 484512eb17e51e75b3353f54edcfc5504a677f42 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 9 Feb 2021 18:13:02 -0500 Subject: [PATCH 024/138] More progress on py refactor --- python/cudf/cudf/core/join/join.py | 176 ++++++++++------------------- 1 file changed, 60 insertions(+), 116 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 257741e6f4c..322913f93cf 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,8 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
-import itertools -from collections import namedtuple - -import pandas as pd +from collections import OrderedDict, namedtuple import cudf from cudf import _lib as libcudf @@ -39,6 +36,11 @@ def get_numeric_index(self): else: return self.obj.index.names.index(self.index) + @property + def is_index_level(self): + # True if this is an index column + return self.index is not MISSING + @property def name(self): # get the name of the column @@ -217,15 +219,18 @@ def perform_merge(self): Potentially also cast the output back to categorical. """ self.match_key_dtypes(_input_to_libcudf_castrules_any) + + left_key_indices = [key.get_numeric_index() for key in self._keys.left] + right_key_indices = [ + key.get_numeric_index() for key in self._keys.right + ] + breakpoint() left_rows, right_rows = libcudf.join.join( self.lhs, self.rhs, - self.how, - self.method, - left_on=self.left_on, - right_on=self.right_on, - left_index=self.left_index, - right_index=self.right_index, + left_on=left_key_indices, + right_on=right_key_indices, + how=self.how, ) return self.construct_result(left_rows, right_rows) @@ -234,14 +239,53 @@ def construct_result(self, left_rows, right_rows): # first construct the index: if self.left_index and not self.right_index: - out_index = self.rhs.index.iloc[right_rows] + # TODO: only gather on index columns: + out_index = self.rhs.index._gather(right_rows) elif self.right_index and not self.left_index: - out_index = self.lhs.index.iloc[left_rows] + # TODO: only gather on index columns: + out_index = self.lhs.index._gather(left_rows) else: out_index = None # now construct the data: - return out_index + data = cudf.core.column_accessor.ColumnAccessor() + left_names, right_names = self.output_column_names() + + for lcol in left_names: + data[left_names[lcol]] = self.lhs[lcol].iloc[left_rows] + for rcol in right_names: + data[right_names[rcol]] = self.rhs[rcol].iloc[right_rows] + return cudf.DataFrame._from_data(data, index=out_index) + + def 
output_column_names(self): + # Return mappings of input column names to (possibly) suffixed + # result column names + left_names = OrderedDict( + zip(self.lhs._data.keys(), self.lhs._data.keys()) + ) + right_names = OrderedDict( + zip(self.rhs._data.keys(), self.rhs._data.keys()) + ) + common_names = set(left_names) & set(right_names) + + if self.on: + key_columns_with_same_name = self.on + else: + key_columns_with_same_name = [] + for lkey, rkey in zip(self._keys.left, self._keys.right): + if (lkey.is_index_level, rkey.is_index_level) == ( + False, + False, + ): + if lkey.name == rkey.name: + key_columns_with_same_name.append(lkey.name) + for name in common_names: + if name not in key_columns_with_same_name: + left_names[name] = f"{name}{self.lsuffix}" + right_names[name] = f"{name}{self.rsuffix}" + else: + del right_names[name] + return left_names, right_names @staticmethod def validate_merge_params( @@ -305,111 +349,11 @@ def validate_merge_params( ) def match_key_dtypes(self, match_func): - """ - Check each pair of join keys in the left and right hand - operands and apply casting rules to match their types - before passing the result to libcudf. - """ + # match the dtypes of the key columns in + # self.lhs and self.rhs according to the matching + # function `match_func` for left_key, right_key in zip(self._keys.left, self._keys.right): lcol, rcol = left_key.value, right_key.value dtype = match_func(lcol, rcol, how=self.how) left_key.set_value(lcol.astype(dtype)) right_key.set_value(rcol.astype(dtype)) - - def compute_output_dtypes(self): - """ - Determine what datatypes should be applied to the result - of a libcudf join, baesd on the original left and right - frames. 
- """ - - index_dtypes = {} - l_data_join_cols = {} - r_data_join_cols = {} - - data_dtypes = { - name: col.dtype - for name, col in itertools.chain( - self.lhs._data.items(), self.rhs._data.items() - ) - } - - if self.left_index and self.right_index: - l_idx_join_cols = list(self.lhs.index._data.values()) - r_idx_join_cols = list(self.rhs.index._data.values()) - elif self.left_on and self.right_index: - # Keep the orignal dtypes in the LEFT index if possible - # should trigger a bunch of no-ops - l_idx_join_cols = list(self.lhs.index._data.values()) - r_idx_join_cols = list(self.lhs.index._data.values()) - for i, name in enumerate(self.left_on): - l_data_join_cols[name] = self.lhs._data[name] - r_data_join_cols[name] = list(self.rhs.index._data.values())[i] - - elif self.left_index and self.right_on: - # see above - l_idx_join_cols = list(self.rhs.index._data.values()) - r_idx_join_cols = list(self.rhs.index._data.values()) - for i, name in enumerate(self.right_on): - l_data_join_cols[name] = list(self.lhs.index._data.values())[i] - r_data_join_cols[name] = self.rhs._data[name] - - if self.left_on and self.right_on: - l_data_join_cols = self.lhs._data - r_data_join_cols = self.rhs._data - - if self.left_index or self.right_index: - for i in range(len(self.lhs.index._data.items())): - index_dtypes[i] = _libcudf_to_output_castrules( - l_idx_join_cols[i], r_idx_join_cols[i], self.how - ) - - for name in itertools.chain(self.left_on, self.right_on): - if name in self.left_on and name in self.right_on: - data_dtypes[name] = _libcudf_to_output_castrules( - l_data_join_cols[name], r_data_join_cols[name], self.how - ) - return (index_dtypes, data_dtypes) - - def typecast_libcudf_to_output(self, output, output_dtypes): - """ - Apply precomputed output index and data column data types - to the output of a libcudf join. 
- """ - - index_dtypes, data_dtypes = output_dtypes - if output._index and len(index_dtypes) > 0: - for index_dtype, index_col_lbl, index_col in zip( - index_dtypes.values(), - output._index._data.keys(), - output._index._data.values(), - ): - if index_dtype: - output._index._data[ - index_col_lbl - ] = self._build_output_col(index_col, index_dtype) - # reconstruct the Index object as the underlying data types - # have changed: - output._index = cudf.core.index.Index._from_table(output._index) - - for data_col_lbl, data_col in output._data.items(): - data_dtype = data_dtypes[data_col_lbl] - if data_dtype: - output._data[data_col_lbl] = self._build_output_col( - data_col, data_dtype - ) - return output - - def _build_output_col(self, col, dtype): - if isinstance( - dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype) - ): - outcol = cudf.core.column.build_categorical_column( - categories=dtype.categories, - codes=col.set_mask(None), - mask=col.base_mask, - ordered=dtype.ordered, - ) - else: - outcol = col.astype(dtype) - return outcol From 5227582c355aa684afb77ac7b8cb8e71feb0970c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 09:04:40 -0500 Subject: [PATCH 025/138] Remove breakpoint --- python/cudf/cudf/core/join/join.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 322913f93cf..fa7970dc8fa 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -224,7 +224,6 @@ def perform_merge(self): right_key_indices = [ key.get_numeric_index() for key in self._keys.right ] - breakpoint() left_rows, right_rows = libcudf.join.join( self.lhs, self.rhs, From 9cd870eb794538ded5e2668e9a7e6228a4bdcd50 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 09:51:03 -0500 Subject: [PATCH 026/138] Fix neg index handling --- cpp/src/copying/gather.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index dc153e9395d..181752d18e8 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -43,9 +43,7 @@ std::unique_ptr
gather(table_view const& source_table, if (neg_indices == negative_index_policy::ALLOWED) { cudf::size_type n_rows = source_table.num_rows(); - auto idx_converter = [n_rows] __device__(size_type in) { - return ((in % n_rows) + n_rows) % n_rows; - }; + auto idx_converter = [n_rows] __device__(size_type in) { return in < 0 ? in + n_rows : in; }; return gather(source_table, thrust::make_transform_iterator(map_begin, idx_converter), thrust::make_transform_iterator(map_end, idx_converter), From 8e4f193d5fd72ac44dd44bc01d868c0b9fb62008 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 09:52:01 -0500 Subject: [PATCH 027/138] Use nullify gather in join --- python/cudf/cudf/_lib/copying.pyx | 12 +++++++++--- python/cudf/cudf/core/frame.py | 8 +++++--- python/cudf/cudf/core/join/join.py | 12 ++++++++---- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ad798a73ed2..ebd14510734 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -130,11 +130,16 @@ def copy_range(Column input_column, input_begin, input_end, target_begin) -def gather(Table source_table, Column gather_map, bool keep_index=True): +def gather( + Table source_table, + Column gather_map, + bool keep_index=True, + bool nullify=False +): if not pd.api.types.is_integer_dtype(gather_map.dtype): raise ValueError("Gather map is not integer dtype.") - if len(gather_map) > 0: + if len(gather_map) > 0 and not nullify: gm_min, gm_max = minmax(gather_map) if gm_min < -len(source_table) or gm_max >= len(source_table): raise IndexError(f"Gather map index with min {gm_min}," @@ -150,7 +155,8 @@ def gather(Table source_table, Column gather_map, bool keep_index=True): source_table_view = source_table.data_view() cdef column_view gather_map_view = gather_map.view() cdef cpp_copying.out_of_bounds_policy policy = ( - cpp_copying.out_of_bounds_policy.DONT_CHECK + 
cpp_copying.out_of_bounds_policy.NULLIFY if nullify + else cpp_copying.out_of_bounds_policy.DONT_CHECK ) with nogil: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2a1aed814fe..c881720ab9a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -27,7 +27,6 @@ min_scalar_type, ) - T = TypeVar("T", bound="Frame") if TYPE_CHECKING: @@ -587,12 +586,15 @@ def _get_columns_by_index(self, indices): data, columns=data.to_pandas_index(), index=self.index ) - def _gather(self, gather_map, keep_index=True): + def _gather(self, gather_map, keep_index=True, nullify=False): if not pd.api.types.is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") result = self.__class__._from_table( libcudf.copying.gather( - self, as_column(gather_map), keep_index=keep_index + self, + as_column(gather_map), + keep_index=keep_index, + nullify=nullify, ) ) result._copy_type_metadata(self) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index fa7970dc8fa..917a7a85486 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -239,10 +239,10 @@ def construct_result(self, left_rows, right_rows): # first construct the index: if self.left_index and not self.right_index: # TODO: only gather on index columns: - out_index = self.rhs.index._gather(right_rows) + out_index = self.rhs.index._gather(right_rows, nullify=True) elif self.right_index and not self.left_index: # TODO: only gather on index columns: - out_index = self.lhs.index._gather(left_rows) + out_index = self.lhs.index._gather(left_rows, nullify=True) else: out_index = None @@ -251,9 +251,13 @@ def construct_result(self, left_rows, right_rows): left_names, right_names = self.output_column_names() for lcol in left_names: - data[left_names[lcol]] = self.lhs[lcol].iloc[left_rows] + data[left_names[lcol]] = self.lhs[lcol]._gather( + left_rows, nullify=True + ) for rcol in right_names: - 
data[right_names[rcol]] = self.rhs[rcol].iloc[right_rows] + data[right_names[rcol]] = self.rhs[rcol]._gather( + right_rows, nullify=True + ) return cudf.DataFrame._from_data(data, index=out_index) def output_column_names(self): From 29fe140b112b5dee3ffb4128cdd13e9bf688e810 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 11:14:15 -0500 Subject: [PATCH 028/138] Handle outer joins better --- python/cudf/cudf/core/column/column.py | 9 +++++++-- python/cudf/cudf/core/join/join.py | 27 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d615a7cfae4..0f99395d919 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -807,7 +807,12 @@ def quantile( def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") - def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T: + def take( + self: T, + indices: ColumnBase, + keep_index: bool = True, + nullify: bool = False, + ) -> T: """Return Column by taking values from the corresponding *indices*. 
""" # Handle zero size @@ -816,7 +821,7 @@ def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T: try: return ( self.as_frame() - ._gather(indices, keep_index=keep_index) + ._gather(indices, keep_index=keep_index, nullify=nullify) ._as_column() ) except RuntimeError as e: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 917a7a85486..2232d02acd9 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -237,12 +237,10 @@ def construct_result(self, left_rows, right_rows): self.match_key_dtypes(_libcudf_to_output_castrules) # first construct the index: - if self.left_index and not self.right_index: - # TODO: only gather on index columns: - out_index = self.rhs.index._gather(right_rows, nullify=True) - elif self.right_index and not self.left_index: - # TODO: only gather on index columns: + if self.left_index: out_index = self.lhs.index._gather(left_rows, nullify=True) + elif self.right_index: + out_index = self.rhs.index._gather(right_rows, nullify=True) else: out_index = None @@ -258,7 +256,20 @@ def construct_result(self, left_rows, right_rows): data[right_names[rcol]] = self.rhs[rcol]._gather( right_rows, nullify=True ) - return cudf.DataFrame._from_data(data, index=out_index) + + result = cudf.DataFrame._from_data(data, index=out_index) + + # if outer join, key columns are combine: + for lkey, rkey in zip(*self._keys): + # get the key column as it appears in the result: + out_key = JoinKey(result, column=lkey.column, index=lkey.index) + + # fill nulls in the key column with values from the RHS + out_key.set_value( + out_key.value.fillna(rkey.value.take(right_rows, nullify=True)) + ) + + return result def output_column_names(self): # Return mappings of input column names to (possibly) suffixed @@ -275,7 +286,7 @@ def output_column_names(self): key_columns_with_same_name = self.on else: key_columns_with_same_name = [] - for lkey, rkey in zip(self._keys.left, self._keys.right): 
+ for lkey, rkey in zip(*self._keys): if (lkey.is_index_level, rkey.is_index_level) == ( False, False, @@ -355,7 +366,7 @@ def match_key_dtypes(self, match_func): # match the dtypes of the key columns in # self.lhs and self.rhs according to the matching # function `match_func` - for left_key, right_key in zip(self._keys.left, self._keys.right): + for left_key, right_key in zip(*self._keys): lcol, rcol = left_key.value, right_key.value dtype = match_func(lcol, rcol, how=self.how) left_key.set_value(lcol.astype(dtype)) From b63405517c875b84aca3e0058d733c38a0045773 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 11:41:43 -0500 Subject: [PATCH 029/138] Fix index construction --- python/cudf/cudf/core/join/join.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 2232d02acd9..adff223116d 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -236,11 +236,18 @@ def perform_merge(self): def construct_result(self, left_rows, right_rows): self.match_key_dtypes(_libcudf_to_output_castrules) - # first construct the index: - if self.left_index: - out_index = self.lhs.index._gather(left_rows, nullify=True) - elif self.right_index: + # first construct the index. 
+ if self.left_index and self.right_index: + if self.how == "right": + out_index = self.rhs.index._gather(left_rows, nullify=True) + else: + out_index = self.lhs.index._gather(left_rows, nullify=True) + elif self.left_index: + # left_index and right_on out_index = self.rhs.index._gather(right_rows, nullify=True) + elif self.right_index: + # right_index and left_on + out_index = self.lhs.index._gather(left_rows, nullify=True) else: out_index = None From cd53d6c2770f4693f38e7e2c9bd15f449ef3a703 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 12:01:39 -0500 Subject: [PATCH 030/138] Fix sorting behaviour --- python/cudf/cudf/core/frame.py | 34 +--------------------------- python/cudf/cudf/core/join/join.py | 36 +++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c881720ab9a..ccbf2cd10b6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3421,39 +3421,7 @@ def _merge( indicator, suffixes, ) - to_return = mergeop.perform_merge() - - # If sort=True, Pandas would sort on the key columns in the - # same order as given in 'on'. If the indices are used as - # keys, the index will be sorted. If one index is specified, - # the key column on the other side will be used to sort. 
- # If no index is specified, return a new RangeIndex - if sort: - to_sort = cudf.DataFrame() - if left_index and right_index: - by = list(to_return._index._data.columns) - if left_on and right_on: - by.extend(to_return[mergeop.left_on]._data.columns) - elif left_index: - by = list(to_return[mergeop.right_on]._data.columns) - elif right_index: - by = list(to_return[mergeop.left_on]._data.columns) - else: - # left_on == right_on, or different names but same columns - # in both cases we can sort by either - by = [to_return._data[name] for name in mergeop.left_on] - for i, col in enumerate(by): - to_sort[i] = col - inds = to_sort.argsort() - if isinstance(to_return, cudf.Index): - to_return = to_return.take(inds) - else: - to_return = to_return.take( - inds, keep_index=(left_index or right_index) - ) - return to_return - else: - return to_return + return mergeop.perform_merge() def _is_sorted(self, ascending=None, null_position=None): """ diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index adff223116d..fb61027bcbd 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -153,6 +153,7 @@ def __init__( self.left_index = left_index self.right_index = right_index self.how = how + self.sort = sort self.lsuffix = lsuffix self.rsuffix = rsuffix self.suffixes = suffixes @@ -160,11 +161,6 @@ def __init__( self.compute_join_keys() def compute_join_keys(self): - def _coerce_to_tuple(obj): - if hasattr(obj, "__iter__") and not isinstance(obj, str): - return tuple(obj) - else: - return (obj,) if ( self.left_index @@ -276,6 +272,25 @@ def construct_result(self, left_rows, right_rows): out_key.value.fillna(rkey.value.take(right_rows, nullify=True)) ) + return self.sort_result(result) + + def sort_result(self, result): + # If sort=True, Pandas sorts on the key columns in the + # same order as given in 'on'. If the indices are used as + # keys, the index will be sorted. 
If one index is specified, + # the key columns on the other side will be used to sort. + if self.sort: + if self.on: + return result.sort_values( + _coerce_to_list(self.on), ignore_index=True + ) + elif self.left_index and self.right_index: + return result.sort_index() + elif self.left_index: + return result.sort_values(_coerce_to_list(self.right_on)) + else: + # self.right_index and self.left_on + return result.sort_values(_coerce_to_list(self.left_on)) return result def output_column_names(self): @@ -378,3 +393,14 @@ def match_key_dtypes(self, match_func): dtype = match_func(lcol, rcol, how=self.how) left_key.set_value(lcol.astype(dtype)) right_key.set_value(rcol.astype(dtype)) + + +def _coerce_to_tuple(obj): + if hasattr(obj, "__iter__") and not isinstance(obj, str): + return tuple(obj) + else: + return (obj,) + + +def _coerce_to_list(obj): + return list(_coerce_to_tuple(obj)) From 75f1efdc51fd0ccf0fe9d07f5e1991377cd23fb0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 16:13:11 -0500 Subject: [PATCH 031/138] Fix Index.join --- python/cudf/cudf/core/index.py | 4 +++ python/cudf/cudf/core/join/join.py | 41 +++++++++++++++++++---------- python/cudf/cudf/core/multiindex.py | 8 +++++- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e3899a403f1..78d43d0275b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1493,6 +1493,10 @@ def _from_table(cls, table): else: return as_index(table) + @classmethod + def _from_data(cls, data, index=None): + return cls._from_table(Frame(data=data)) + _accessors = set() # type: Set[Any] @property diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index fb61027bcbd..1223a082800 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -16,8 +16,8 @@ class _MISSING_TYPE: MISSING = _MISSING_TYPE() -class JoinKey: - # A JoinKey 
represents one column of a Series +class ColumnView: + # A ColumnView represents one column of a Series # or DataFrame - either an index column or a # data column @@ -30,9 +30,10 @@ def __init__(self, obj, column=MISSING, index=MISSING): def get_numeric_index(self): # get the position of the column (including any index columns) if self.index is MISSING: - return len(self.obj.index.names) + self.obj.columns.get_loc( - self.column + index_nlevels = ( + self.obj.index.nlevels if self.obj._index is not None else 0 ) + return index_nlevels + tuple(self.obj._data).index(self.column) else: return self.obj.index.names.index(self.index) @@ -158,6 +159,14 @@ def __init__( self.rsuffix = rsuffix self.suffixes = suffixes + self.out_class = cudf.DataFrame + if isinstance(self.lhs, cudf.MultiIndex) or isinstance( + self.rhs, cudf.MultiIndex + ): + self.out_class = cudf.MultiIndex + elif isinstance(self.lhs, cudf.Index): + self.out_class = self.lhs.__class__ + self.compute_join_keys() def compute_join_keys(self): @@ -170,24 +179,24 @@ def compute_join_keys(self): ): if self.left_index: left_keys = [ - JoinKey(obj=self.lhs, index=on) + ColumnView(obj=self.lhs, index=on) for on in self.lhs.index.names ] else: # TODO: require left_on or left_index to be specified left_keys = [ - JoinKey(obj=self.lhs, column=on) + ColumnView(obj=self.lhs, column=on) for on in _coerce_to_tuple(self.left_on) ] if self.right_index: right_keys = [ - JoinKey(obj=self.rhs, index=on) + ColumnView(obj=self.rhs, index=on) for on in self.rhs.index.names ] else: # TODO: require right_on or right_index to be specified right_keys = [ - JoinKey(obj=self.rhs, column=on) + ColumnView(obj=self.rhs, column=on) for on in _coerce_to_tuple(self.right_on) ] else: @@ -198,8 +207,12 @@ def compute_join_keys(self): if self.on is not None else set(self.lhs._data.keys()) & set(self.rhs._data.keys()) ) - left_keys = [JoinKey(obj=self.lhs, column=on) for on in on_names] - right_keys = [JoinKey(obj=self.rhs, column=on) for on in 
on_names] + left_keys = [ + ColumnView(obj=self.lhs, column=on) for on in on_names + ] + right_keys = [ + ColumnView(obj=self.rhs, column=on) for on in on_names + ] if len(left_keys) != len(right_keys): raise ValueError( @@ -252,20 +265,20 @@ def construct_result(self, left_rows, right_rows): left_names, right_names = self.output_column_names() for lcol in left_names: - data[left_names[lcol]] = self.lhs[lcol]._gather( + data[left_names[lcol]] = self.lhs._data[lcol].take( left_rows, nullify=True ) for rcol in right_names: - data[right_names[rcol]] = self.rhs[rcol]._gather( + data[right_names[rcol]] = self.rhs._data[rcol].take( right_rows, nullify=True ) - result = cudf.DataFrame._from_data(data, index=out_index) + result = self.out_class._from_data(data, index=out_index) # if outer join, key columns are combine: for lkey, rkey in zip(*self._keys): # get the key column as it appears in the result: - out_key = JoinKey(result, column=lkey.column, index=lkey.index) + out_key = ColumnView(result, column=lkey.column, index=lkey.index) # fill nulls in the key column with values from the RHS out_key.set_value( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4e82a1f72b0..b72fa748cff 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,6 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. +from __future__ import annotations + import itertools import numbers import pickle @@ -16,6 +18,7 @@ from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries from cudf.core.column import column +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, as_index @@ -232,7 +235,6 @@ def rename(self, names, inplace=False): ValueError: Length of names must match number of levels in MultiIndex. 
""" - return self.set_names(names, level=None, inplace=inplace) def set_names(self, names, level=None, inplace=False): @@ -276,6 +278,10 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) + @classmethod + def _from_data(cls, data: ColumnAccessor, index=None) -> MultiIndex: + return cls.from_frame(cudf.DataFrame._from_data(data)) + @classmethod def _from_table(cls, table, names=None): df = cudf.DataFrame(table._data) From 1f5d6ad0c3bd8ac938b366506275490f13498817 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Feb 2021 17:42:53 -0500 Subject: [PATCH 032/138] Progress on semi/anti joins --- cpp/src/join/semi_join.cu | 5 +- python/cudf/cudf/_lib/join.pyx | 7 +- python/cudf/cudf/core/join/join.py | 100 +++++++++++++++++++++++++---- 3 files changed, 95 insertions(+), 17 deletions(-) diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 59298c75f1e..db60e74f81b 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -17,6 +17,8 @@ #include #include +#include + #include #include #include @@ -100,7 +102,8 @@ std::unique_ptr left_semi_anti_join( return (pos != hash_table.end()) == join_type_boolean; }); - auto join_size = std::distance(gather_map.begin(), gather_map_end); + auto join_size = thrust::distance(gather_map.begin(), gather_map_end); + std::cout << join_size << std::endl; return std::make_unique(cudf::data_type(type_to_id()), join_size, gather_map.release(), diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 5d07604f3bb..41b59e3d2e7 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -55,7 +55,7 @@ cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): ) -cpdef join_semi_anti(Table lhs, Table rhs, left_on, right_on, how=None): +cpdef semi_join(Table lhs, Table rhs, left_on, right_on, how=None): # left-semi and left-anti joins cdef vector[int] c_left_on = left_on cdef vector[int] 
c_right_on = right_on @@ -63,14 +63,14 @@ cpdef join_semi_anti(Table lhs, Table rhs, left_on, right_on, how=None): cdef table_view c_lhs = lhs.view() cdef table_view c_rhs = rhs.view() - if how == "semi": + if how == "leftsemi": c_result = move(cpp_join.left_semi_join( c_lhs, c_rhs, c_left_on, c_right_on )) - elif how == "anti": + elif how == "leftanti": c_result = move(cpp_join.left_anti_join( c_lhs, c_rhs, @@ -79,3 +79,4 @@ cpdef join_semi_anti(Table lhs, Table rhs, left_on, right_on, how=None): )) else: raise ValueError(f"Invalid join type {how}") + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 1223a082800..094e6e1b46e 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -69,7 +69,59 @@ def set_value(self, value): JoinKeys = namedtuple("JoinKeys", ["left", "right"]) -class Merge(object): +def Merge( + lhs, + rhs, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + how="inner", + sort=False, + lsuffix="_x", + rsuffix="_y", + method=None, + indicator=None, + suffixes=None, +): + if how not in {"leftsemi", "leftanti"}: + return MergeBase( + lhs, + rhs, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + sort=sort, + lsuffix=lsuffix, + rsuffix=rsuffix, + method=method, + indicator=indicator, + suffixes=suffixes, + ) + else: + return MergeSemi( + lhs, + rhs, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + sort=sort, + lsuffix=lsuffix, + rsuffix=rsuffix, + method=method, + indicator=indicator, + suffixes=suffixes, + ) + + +class MergeBase(object): def __init__( self, lhs, @@ -222,11 +274,6 @@ def compute_join_keys(self): self._keys = JoinKeys(left=left_keys, right=right_keys) def perform_merge(self): - """ - Call libcudf to perform a merge between the operands. 
If - necessary, cast the input key columns to compatible types. - Potentially also cast the output back to categorical. - """ self.match_key_dtypes(_input_to_libcudf_castrules_any) left_key_indices = [key.get_numeric_index() for key in self._keys.left] @@ -276,14 +323,19 @@ def construct_result(self, left_rows, right_rows): result = self.out_class._from_data(data, index=out_index) # if outer join, key columns are combine: - for lkey, rkey in zip(*self._keys): - # get the key column as it appears in the result: - out_key = ColumnView(result, column=lkey.column, index=lkey.index) + if self.how == "outer": + for lkey, rkey in zip(*self._keys): + # get the key column as it appears in the result: + out_key = ColumnView( + result, column=lkey.column, index=lkey.index + ) - # fill nulls in the key column with values from the RHS - out_key.set_value( - out_key.value.fillna(rkey.value.take(right_rows, nullify=True)) - ) + # fill nulls in the key column with values from the RHS + out_key.set_value( + out_key.value.fillna( + rkey.value.take(right_rows, nullify=True) + ) + ) return self.sort_result(result) @@ -408,6 +460,28 @@ def match_key_dtypes(self, match_func): right_key.set_value(rcol.astype(dtype)) +class MergeSemi(MergeBase): + def perform_merge(self): + self.match_key_dtypes(_input_to_libcudf_castrules_any) + + left_key_indices = [key.get_numeric_index() for key in self._keys.left] + right_key_indices = [ + key.get_numeric_index() for key in self._keys.right + ] + left_rows = libcudf.join.semi_join( + self.lhs, + self.rhs, + left_on=left_key_indices, + right_on=right_key_indices, + how=self.how, + ) + return self.construct_result(left_rows, cudf.core.column.as_column([])) + + def output_column_names(self): + left_names, _ = super().output_column_names() + return left_names, {} + + def _coerce_to_tuple(obj): if hasattr(obj, "__iter__") and not isinstance(obj, str): return tuple(obj) From de305200c388fa4575d7145f9c35dd4f035dbbd2 Mon Sep 17 00:00:00 2001 From: Ashwin 
Srinath Date: Wed, 10 Feb 2021 18:57:28 -0500 Subject: [PATCH 033/138] Add simple join test --- cpp/tests/join/semi_join_tests.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/tests/join/semi_join_tests.cpp b/cpp/tests/join/semi_join_tests.cpp index a665d07ee3c..30ac1b57e55 100644 --- a/cpp/tests/join/semi_join_tests.cpp +++ b/cpp/tests/join/semi_join_tests.cpp @@ -842,3 +842,22 @@ TEST_F(JoinDictionaryTest, LeftAntiJoinWithNulls) auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); } + +TEST_F(JoinTest, LeftSemiSimple) +{ + column_wrapper a_0{1, 9, 0}; + column_wrapper a_1{1, 2, 3}; + auto table_a = cudf::table_view({a_0, a_1}); + + column_wrapper b_0{0, 1}; + column_wrapper b_1{1, 2}; + auto table_b = cudf::table_view({b_0, b_1}); + + auto result = cudf::left_anti_join(table_a, table_b, {0}, {0}, {0, 1}); + auto result_view = result->view(); + + column_wrapper expect_0{9}; + column_wrapper expect_1{2}; + auto expect = cudf::table_view({expect_0, expect_1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expect); +} From 66a0de5e3da5195e90ef4c2712f06baf8a1d44eb Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 11 Feb 2021 12:07:32 -0500 Subject: [PATCH 034/138] Semi-join fix --- cpp/src/join/semi_join.cu | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index db60e74f81b..8d91ee38725 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -37,32 +37,29 @@ namespace detail { template std::unique_ptr left_semi_anti_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, null_equality compare_nulls, rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); - CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty"); - CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); + CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); + CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); - auto const left_num_rows = left.num_rows(); - auto const right_num_rows = right.num_rows(); + auto const left_num_rows = left_keys.num_rows(); + auto const right_num_rows = right_keys.num_rows(); // Only care about existence, so we'll use an unordered map (other joins need a multimap) using hash_table_type = concurrent_unordered_map; // Create hash table containing all keys found in right table - auto right_rows_d = table_device_view::create(right, stream); + auto right_rows_d = table_device_view::create(right_keys, stream); size_t const hash_table_size = compute_hash_table_size(right_num_rows); row_hash hash_build{*right_rows_d}; row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; // Going to join it with left table - auto left_rows_d = table_device_view::create(left, stream); + auto left_rows_d = table_device_view::create(left_keys, stream); row_hash hash_probe{*left_rows_d}; row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; @@ -103,7 +100,6 @@ std::unique_ptr left_semi_anti_join( }); auto join_size = thrust::distance(gather_map.begin(), gather_map_end); - std::cout << join_size << std::endl; return std::make_unique(cudf::data_type(type_to_id()), join_size, gather_map.release(), @@ -160,6 +156,8 @@ std::unique_ptr left_semi_anti_join( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { + CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be 
joined on"); + if (0 == return_columns.size()) { return empty_like(left.select(return_columns)); } if (is_trivial_join(left, right, left_on, right_on, JoinKind)) { @@ -181,8 +179,8 @@ std::unique_ptr left_semi_anti_join( auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); - auto gather_map = left_semi_anti_join( - left_selected, right_selected, left_on, right_on, compare_nulls, stream); + auto gather_map = + left_semi_anti_join(left_selected, right_selected, compare_nulls, stream); auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather(left_updated.select(return_columns), @@ -217,7 +215,7 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + left.select(left_on), right.select(right_on), compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, @@ -242,7 +240,7 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + left.select(left_on), right.select(right_on), compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf From ca72295905f1838fc441aa5b7605e51337314655 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 11 Feb 2021 12:16:55 -0500 Subject: [PATCH 035/138] Only combine key columns in outer join if they have the same name --- python/cudf/cudf/core/join/join.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 094e6e1b46e..155cfdb99f7 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -322,20 +322,21 @@ def construct_result(self, left_rows, right_rows): 
result = self.out_class._from_data(data, index=out_index) - # if outer join, key columns are combine: + # if outer join, key columns with the same name are combined: if self.how == "outer": for lkey, rkey in zip(*self._keys): - # get the key column as it appears in the result: - out_key = ColumnView( - result, column=lkey.column, index=lkey.index - ) + if lkey.name == rkey.name: + # get the key column as it appears in the result: + out_key = ColumnView( + result, column=lkey.column, index=lkey.index + ) - # fill nulls in the key column with values from the RHS - out_key.set_value( - out_key.value.fillna( - rkey.value.take(right_rows, nullify=True) + # fill nulls in the key column with values from the RHS + out_key.set_value( + out_key.value.fillna( + rkey.value.take(right_rows, nullify=True) + ) ) - ) return self.sort_result(result) From ee2242dcfdebc9b386dd7ad4b4d3711a30ebfcbf Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 11 Feb 2021 12:53:53 -0500 Subject: [PATCH 036/138] Handle when both _on and _index are provided --- python/cudf/cudf/core/join/join.py | 46 ++++++++++++++++++------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 155cfdb99f7..927bf6d9d4b 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -229,28 +229,38 @@ def compute_join_keys(self): or self.left_on or self.right_on ): + left_keys = [] + right_keys = [] if self.left_index: - left_keys = [ - ColumnView(obj=self.lhs, index=on) - for on in self.lhs.index.names - ] - else: + left_keys.extend( + [ + ColumnView(obj=self.lhs, index=on) + for on in self.lhs.index.names + ] + ) + if self.left_on: # TODO: require left_on or left_index to be specified - left_keys = [ - ColumnView(obj=self.lhs, column=on) - for on in _coerce_to_tuple(self.left_on) - ] + left_keys.extend( + [ + ColumnView(obj=self.lhs, column=on) + for on in _coerce_to_tuple(self.left_on) + ] 
+ ) if self.right_index: - right_keys = [ - ColumnView(obj=self.rhs, index=on) - for on in self.rhs.index.names - ] - else: + right_keys.extend( + [ + ColumnView(obj=self.rhs, index=on) + for on in self.rhs.index.names + ] + ) + if self.right_on: # TODO: require right_on or right_index to be specified - right_keys = [ - ColumnView(obj=self.rhs, column=on) - for on in _coerce_to_tuple(self.right_on) - ] + right_keys.extend( + [ + ColumnView(obj=self.rhs, column=on) + for on in _coerce_to_tuple(self.right_on) + ] + ) else: # Use `on` if provided. Otherwise, # implicitly use identically named columns as the key columns: From e53172582d9e63ca64de4283804f8bf838ab105b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 11 Feb 2021 12:54:39 -0500 Subject: [PATCH 037/138] Fix sorting join result --- python/cudf/cudf/core/join/join.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 927bf6d9d4b..41830b7a80f 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -360,13 +360,27 @@ def sort_result(self, result): return result.sort_values( _coerce_to_list(self.on), ignore_index=True ) - elif self.left_index and self.right_index: - return result.sort_index() - elif self.left_index: - return result.sort_values(_coerce_to_list(self.right_on)) - else: - # self.right_index and self.left_on - return result.sort_values(_coerce_to_list(self.left_on)) + by = [] + if self.left_index and self.right_index: + by.extend(result.index._data.columns) + if self.left_on: + by.extend( + [ + result._data[col] + for col in _coerce_to_list(self.left_on) + ] + ) + if self.right_on: + by.extend( + [ + result._data[col] + for col in _coerce_to_list(self.right_on) + ] + ) + if by: + to_sort = cudf.DataFrame._from_columns(by) + sort_order = to_sort.argsort() + result = result.take(sort_order) return result def output_column_names(self): 
From 674095ce434bca80bc77409d5a22d4544653b0f4 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 12 Feb 2021 07:26:49 -0500 Subject: [PATCH 038/138] whitespace --- python/cudf/cudf/core/join/casting_logic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/join/casting_logic.py b/python/cudf/cudf/core/join/casting_logic.py index eb85cecd14d..acd8efca8a9 100644 --- a/python/cudf/cudf/core/join/casting_logic.py +++ b/python/cudf/cudf/core/join/casting_logic.py @@ -186,7 +186,7 @@ def _libcudf_to_output_castrules(lcol, rcol, how): l_is_cat = isinstance(ltype, CategoricalDtype) r_is_cat = isinstance(rtype, CategoricalDtype) - # we currently only need to do this for categorical variables + # we currently only need to do this for categorical variables if how == "inner": if l_is_cat and r_is_cat: merge_return_type = "category" From cbd9dc371fcf60f5404e667ae1a8cebfd6513a37 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 12 Feb 2021 14:08:04 -0500 Subject: [PATCH 039/138] Make construct_join_output_df work with column views --- cpp/src/join/hash_join.cu | 50 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 16a453d7068..d478209c3d6 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -406,7 +406,7 @@ template std::pair, std::unique_ptr
> construct_join_output_df( table_view const &probe, table_view const &build, - VectorPair &joined_indices, + std::pair &joined_indices, std::vector> const &columns_in_common, cudf::hash_join::common_columns_output_side common_columns_output_side, rmm::cuda_stream_view stream, @@ -433,26 +433,28 @@ std::pair, std::unique_ptr
> construct_join_output_ // Construct the joined columns if (join_kind::FULL_JOIN == JoinKind) { if (not columns_in_common.empty()) { - auto common_from_build = detail::gather(build.select(build_common_col), - joined_indices.second.begin() + probe.num_rows(), - joined_indices.second.end(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - auto common_from_probe = detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.begin() + probe.num_rows(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - common_table = cudf::detail::concatenate( + auto common_from_build = + detail::gather(build.select(build_common_col), + joined_indices.second.begin() + probe.num_rows(), + joined_indices.second.end(), + bounds_policy, + stream, + rmm::mr::get_current_device_resource()); + auto common_from_probe = + detail::gather(probe.select(probe_common_col), + joined_indices.first.begin(), + joined_indices.first.begin() + probe.num_rows(), + bounds_policy, + stream, + rmm::mr::get_current_device_resource()); + common_table = cudf::detail::concatenate( {common_from_probe->view(), common_from_build->view()}, stream, mr); } } else { if (not columns_in_common.empty()) { common_table = detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.end(), + joined_indices.first.begin(), + joined_indices.first.end(), bounds_policy, stream, mr); @@ -461,15 +463,15 @@ std::pair, std::unique_ptr
> construct_join_output_ // Construct the probe non common columns std::unique_ptr
probe_table = detail::gather(probe.select(probe_noncommon_col), - joined_indices.first.begin(), - joined_indices.first.end(), + joined_indices.first.begin(), + joined_indices.first.end(), bounds_policy, stream, mr); std::unique_ptr
build_table = detail::gather(build.select(build_noncommon_col), - joined_indices.second.begin(), - joined_indices.second.end(), + joined_indices.second.begin(), + joined_indices.second.end(), bounds_policy, stream, mr); @@ -688,15 +690,17 @@ hash_join::hash_join_impl::compute_hash_join( }), "Invalid values passed to columns_in_common"); - auto joined_indices = - compute_hash_join_indices(probe, probe_on, compare_nulls, stream, mr); + auto joined_indices = compute_hash_join(probe, probe_on, compare_nulls, stream, mr); if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side); } + auto joined_indices_view = std::make_pair( + joined_indices.first->view(), joined_indices.second->view()); + return cudf::detail::construct_join_output_df( - probe, _build, joined_indices, columns_in_common, common_columns_output_side, stream, mr); + probe, _build, joined_indices_view, columns_in_common, common_columns_output_side, stream, mr); } template From 3f3c3cb60d5f02bf62855a00dcc2ff7674750ebe Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 12 Feb 2021 15:25:53 -0500 Subject: [PATCH 040/138] Get rid of hash_join::left_join --- cpp/include/cudf/join.hpp | 30 ---- cpp/src/join/hash_join.cu | 108 +------------- cpp/src/join/hash_join.cuh | 114 +++++++++++++-- cpp/src/join/join.cu | 34 +++-- cpp/tests/join/join_tests.cpp | 260 +++++++++++++++++----------------- 5 files changed, 260 insertions(+), 286 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 0a42d28de2d..72cd4066cf1 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -539,36 +539,6 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** - * @brief Performs a left join by probing in the internal hash table. 
- * - * More details please @see cudf::left_join(). - * - * @param probe The probe table, from which the tuples are probed. - * @param probe_on The column indices from `probe` to join on. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `build_on`. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param mr Device memory resource used to allocate the returned table and columns' device - * memory. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Result of joining `build` and `probe` tables on the columns - * specified by `build_on` and `probe_on`. The resulting table will be joined columns of - * `probe(including common columns)+build(excluding common columns)`. 
- */ - std::unique_ptr left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** * @brief Performs a full join on the specified columns of two * tables (`left`, `right`), and returns the row indices corresponding diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index d478209c3d6..4cc91368565 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -41,8 +41,8 @@ namespace detail { * @return vector A vector containing only the indices which are not present in * `common_column_indices` */ -auto non_common_column_indices(size_type num_columns, - std::vector const &common_column_indices) +std::vector non_common_column_indices( + size_type num_columns, std::vector const &common_column_indices) { CUDF_EXPECTS(common_column_indices.size() <= static_cast(num_columns), "Too many columns in common"); @@ -402,89 +402,6 @@ std::pair, std::unique_ptr
> combine_join_columns( * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`, * `build(including common columns)`) if `common_columns_output_side` is `BUILD`. */ -template -std::pair, std::unique_ptr
> construct_join_output_df( - table_view const &probe, - table_view const &build, - std::pair &joined_indices, - std::vector> const &columns_in_common, - cudf::hash_join::common_columns_output_side common_columns_output_side, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::vector probe_common_col; - probe_common_col.reserve(columns_in_common.size()); - std::vector build_common_col; - build_common_col.reserve(columns_in_common.size()); - for (const auto &c : columns_in_common) { - probe_common_col.push_back(c.first); - build_common_col.push_back(c.second); - } - std::vector probe_noncommon_col = - non_common_column_indices(probe.num_columns(), probe_common_col); - std::vector build_noncommon_col = - non_common_column_indices(build.num_columns(), build_common_col); - - out_of_bounds_policy const bounds_policy = JoinKind != join_kind::INNER_JOIN - ? out_of_bounds_policy::NULLIFY - : out_of_bounds_policy::DONT_CHECK; - - std::unique_ptr
common_table = std::make_unique
(); - // Construct the joined columns - if (join_kind::FULL_JOIN == JoinKind) { - if (not columns_in_common.empty()) { - auto common_from_build = - detail::gather(build.select(build_common_col), - joined_indices.second.begin() + probe.num_rows(), - joined_indices.second.end(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - auto common_from_probe = - detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.begin() + probe.num_rows(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - common_table = cudf::detail::concatenate( - {common_from_probe->view(), common_from_build->view()}, stream, mr); - } - } else { - if (not columns_in_common.empty()) { - common_table = detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.end(), - bounds_policy, - stream, - mr); - } - } - - // Construct the probe non common columns - std::unique_ptr
probe_table = detail::gather(probe.select(probe_noncommon_col), - joined_indices.first.begin(), - joined_indices.first.end(), - bounds_policy, - stream, - mr); - - std::unique_ptr
build_table = detail::gather(build.select(build_noncommon_col), - joined_indices.second.begin(), - joined_indices.second.end(), - bounds_policy, - stream, - mr); - - return combine_join_columns(probe_table->release(), - probe_noncommon_col, - probe_common_col, - build_table->release(), - build_noncommon_col, - build_common_col, - common_table->release(), - common_columns_output_side); -} std::unique_ptr combine_table_pair(std::unique_ptr &&left, std::unique_ptr &&right) @@ -559,27 +476,6 @@ hash_join::hash_join_impl::left_join(cudf::table_view const &probe, probe, probe_on, compare_nulls, stream, mr); } -std::unique_ptr hash_join::hash_join_impl::left_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - CUDF_FUNC_RANGE(); - auto probe_build_pair = - compute_hash_join(probe, - probe_on, - columns_in_common, - common_columns_output_side::PROBE, - compare_nulls, - stream, - mr); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); -} - std::pair, std::unique_ptr> hash_join::hash_join_impl::full_join(cudf::table_view const &probe, std::vector const &probe_on, diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index c33029dea55..16a4edf7d5f 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -15,6 +15,9 @@ */ #pragma once +#include +#include +#include #include #include @@ -191,9 +194,112 @@ get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stre return std::make_pair(std::move(left_indices), std::move(right_indices)); } +std::pair, std::unique_ptr
> get_empty_joined_table( + table_view const& probe, + table_view const& build, + std::vector> const& columns_in_common, + cudf::hash_join::common_columns_output_side common_columns_output_side); + std::unique_ptr combine_table_pair(std::unique_ptr&& left, std::unique_ptr&& right); +std::pair, std::unique_ptr
> combine_join_columns( + std::vector>&& probe_noncommon_cols, + std::vector const& probe_noncommon_col_indices, + std::vector const& probe_common_col_indices, + std::vector>&& build_noncommon_cols, + std::vector const& build_noncommon_col_indices, + std::vector const& build_common_col_indices, + std::vector>&& common_cols, + cudf::hash_join::common_columns_output_side common_columns_output_side); + +std::vector non_common_column_indices( + size_type num_columns, std::vector const& common_column_indices); + +template +std::pair, std::unique_ptr
> construct_join_output_df( + table_view const& probe, + table_view const& build, + std::pair& joined_indices, + std::vector> const& columns_in_common, + cudf::hash_join::common_columns_output_side common_columns_output_side, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector probe_common_col; + probe_common_col.reserve(columns_in_common.size()); + std::vector build_common_col; + build_common_col.reserve(columns_in_common.size()); + for (const auto& c : columns_in_common) { + probe_common_col.push_back(c.first); + build_common_col.push_back(c.second); + } + std::vector probe_noncommon_col = + non_common_column_indices(probe.num_columns(), probe_common_col); + std::vector build_noncommon_col = + non_common_column_indices(build.num_columns(), build_common_col); + + out_of_bounds_policy const bounds_policy = JoinKind != join_kind::INNER_JOIN + ? out_of_bounds_policy::NULLIFY + : out_of_bounds_policy::DONT_CHECK; + + std::unique_ptr
common_table = std::make_unique
(); + // Construct the joined columns + if (join_kind::FULL_JOIN == JoinKind) { + if (not columns_in_common.empty()) { + auto common_from_build = + detail::gather(build.select(build_common_col), + joined_indices.second.begin() + probe.num_rows(), + joined_indices.second.end(), + bounds_policy, + stream, + rmm::mr::get_current_device_resource()); + auto common_from_probe = + detail::gather(probe.select(probe_common_col), + joined_indices.first.begin(), + joined_indices.first.begin() + probe.num_rows(), + bounds_policy, + stream, + rmm::mr::get_current_device_resource()); + common_table = cudf::detail::concatenate( + {common_from_probe->view(), common_from_build->view()}, stream, mr); + } + } else { + if (not columns_in_common.empty()) { + common_table = detail::gather(probe.select(probe_common_col), + joined_indices.first.begin(), + joined_indices.first.end(), + bounds_policy, + stream, + mr); + } + } + + // Construct the probe non common columns + std::unique_ptr
probe_table = detail::gather(probe.select(probe_noncommon_col), + joined_indices.first.begin(), + joined_indices.first.end(), + bounds_policy, + stream, + mr); + + std::unique_ptr
build_table = detail::gather(build.select(build_noncommon_col), + joined_indices.second.begin(), + joined_indices.second.end(), + bounds_policy, + stream, + mr); + + return combine_join_columns(probe_table->release(), + probe_noncommon_col, + probe_common_col, + build_table->release(), + build_noncommon_col, + build_common_col, + common_table->release(), + common_columns_output_side); +} + } // namespace detail struct hash_join::hash_join_impl { @@ -254,14 +360,6 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::unique_ptr left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - std::pair, std::unique_ptr> full_join( cudf::table_view const& probe, std::vector const& probe_on, diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 61a0a85ef50..ddac509d9ef 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -155,7 +155,28 @@ std::unique_ptr
left_join( table_view const right = scatter_columns(matched.second.back(), right_on, right_input); cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr); + auto join_indices = hj_obj.left_join(left, left_on, compare_nulls, stream, mr); + + if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::LEFT_JOIN)) { + auto probe_build_pair = get_empty_joined_table( + left, right, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE); + return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), + std::move(probe_build_pair.second)); + } + + auto join_indices_view = std::make_pair( + join_indices.first->view(), join_indices.second->view()); + + auto probe_build_pair = construct_join_output_df( + left, + right, + join_indices_view, + columns_in_common, + cudf::hash_join::common_columns_output_side::PROBE, + stream, + mr); + + return combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } std::pair, std::unique_ptr> full_join( @@ -250,17 +271,6 @@ std::pair, std::unique_ptr> hash_joi return impl->left_join(probe, probe_on, compare_nulls, stream, mr); } -std::unique_ptr hash_join::left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); -} - std::pair, std::unique_ptr> hash_join::full_join( cudf::table_view const& probe, std::vector const& probe_on, diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index f0748e2ec29..9fbccbda795 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1117,136 +1117,136 @@ TEST_F(JoinTest, InnerJoinCornerCase) CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, 
*sorted_result); } -TEST_F(JoinTest, HashJoinSequentialProbes) -{ - CVector cols1; - cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); - cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release()); - cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); - - Table t1(std::move(cols1)); - - cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL); - - { - CVector cols0; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); - cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - - Table t0(std::move(cols0)); - - auto result = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release()); - cols_gold.emplace_back( - strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release()); - cols_gold.emplace_back( - column_wrapper{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}} - .release()); - cols_gold.emplace_back( - column_wrapper{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}} - .release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - } - - { - CVector cols0; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); - cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - - Table t0(std::move(cols0)); - - auto result = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), 
*result_sort_order); - - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}.release()); - cols_gold.emplace_back( - strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}.release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - } - - { - CVector cols0; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - - Table t0(std::move(cols0)); - - auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}}); - auto joined_cols = probe_build_pair.first->release(); - auto build_cols = probe_build_pair.second->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(build_cols.begin()), - std::make_move_iterator(build_cols.end())); - auto result = std::make_unique(std::move(joined_cols)); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); 
- CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - } - - { - CVector cols0; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - - Table t0(std::move(cols0)); - - auto probe_build_pair = hash_join.inner_join( - t0, {1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD); - auto joined_cols = probe_build_pair.second->release(); - auto probe_cols = probe_build_pair.first->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(probe_cols.begin()), - std::make_move_iterator(probe_cols.end())); - auto result = std::make_unique(std::move(joined_cols)); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - } -} +// TEST_F(JoinTest, HashJoinSequentialProbes) +// { +// CVector cols1; +// cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); +// cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release()); +// cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); + +// Table t1(std::move(cols1)); + +// cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL); + +// { +// CVector cols0; +// 
cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); +// cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); +// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// Table t0(std::move(cols0)); + +// auto result = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// CVector cols_gold; +// cols_gold.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release()); +// cols_gold.emplace_back( +// strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release()); +// cols_gold.emplace_back( +// column_wrapper{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}} +// .release()); +// cols_gold.emplace_back( +// column_wrapper{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}} +// .release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } + +// { +// CVector cols0; +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); +// cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); +// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// Table t0(std::move(cols0)); + +// auto result = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// CVector cols_gold; +// cols_gold.emplace_back(column_wrapper{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}.release()); +// cols_gold.emplace_back( +// strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release()); +// cols_gold.emplace_back(column_wrapper{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}.release()); +// 
cols_gold.emplace_back(column_wrapper{{1, -1, -1, -1, -1}, {1, 0, 0, 0, +// 0}}.release()); Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } + +// { +// CVector cols0; +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); +// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// Table t0(std::move(cols0)); + +// auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}}); +// auto joined_cols = probe_build_pair.first->release(); +// auto build_cols = probe_build_pair.second->release(); +// joined_cols.insert(joined_cols.end(), +// std::make_move_iterator(build_cols.begin()), +// std::make_move_iterator(build_cols.end())); +// auto result = std::make_unique(std::move(joined_cols)); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// CVector cols_gold; +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); +// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); +// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } + +// { +// CVector cols0; +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// 
cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); +// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// Table t0(std::move(cols0)); + +// auto probe_build_pair = hash_join.inner_join( +// t0, {1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD); +// auto joined_cols = probe_build_pair.second->release(); +// auto probe_cols = probe_build_pair.first->release(); +// joined_cols.insert(joined_cols.end(), +// std::make_move_iterator(probe_cols.begin()), +// std::make_move_iterator(probe_cols.end())); +// auto result = std::make_unique(std::move(joined_cols)); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// CVector cols_gold; +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); +// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } +// } struct JoinDictionaryTest : public cudf::test::BaseFixture { }; From 01415fc11fac50895e7d1586c6ccc36e0503e2e1 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 12 Feb 2021 16:09:27 -0500 Subject: [PATCH 041/138] More join C++ cleanup --- cpp/include/cudf/join.hpp | 74 --------------------------- cpp/src/join/join.cu | 104 ++++++++++++++++++++++++-------------- 2 files changed, 65 insertions(+), 113 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 72cd4066cf1..09064ee24da 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -483,50 +483,6 @@ class 
hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** - * @brief Performs an inner join by probing in the internal hash table. - * - * Given that it is sometimes desired to choose the small table to be the `build` side for an - * inner join,a (`probe`, `build`) table pair, which contains the probe and build portions of the - * logical joined table respectively, is returned so that caller can freely rearrange them to - * restore the logical `left` `right` order. This introduces some extra logic about where "common" - * columns should go, i.e. the legacy `cudf::inner_join()` API always outputs "common" columns in - * the `left` portion and the corresponding columns in the `right` portion are omitted. To better - * align with the legacy `cudf::inner_join()` API, a `common_columns_output_side` parameter is - * introduced to specify whether "common" columns should go in `probe` or `build` portion. - * - * More details please @see cudf::inner_join(). - * - * @param probe The probe table, from which the tuples are probed. - * @param probe_on The column indices from `probe` to join on. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns or `build_on` columns if `probe_output_side` is LEFT or RIGHT. - * Else, for every column in `probe_on` and `build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `build_on`. - * @param common_columns_output_side @see `common_columns_output_side`. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param mr Device memory resource used to allocate the returned table and columns' device - * memory. 
- * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Table pair of (`probe`, `build`) of joining both tables on the columns - * specified by `probe_on` and `build_on`. The resulting table pair will be joined columns of - * (`probe(including common columns)`, `build(excluding common columns)`) if - * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`, - * `build(including common columns)`) if `common_columns_output_side` is `BUILD`. - */ - std::pair, std::unique_ptr> inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side = common_columns_output_side::PROBE, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** * @brief Performs a left join on the specified columns of two * tables (`left`, `right`), and returns the row indices corresponding @@ -551,36 +507,6 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - /** - * @brief Performs a full join by probing in the internal hash table. - * - * More details please @see cudf::full_join(). - * - * @param probe The probe table, from which the tuples are probed. - * @param probe_on The column indices from `probe` to join on. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `build_on`. 
- * @param compare_nulls Controls whether null join-key values should match or not. - * @param mr Device memory resource used to allocate the returned table and columns' device - * memory. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Result of joining `build` and `probe` tables on the columns - * specified by `build_on` and `probe_on`. The resulting table will be joined columns of - * `probe(including common columns)+build(excluding common columns)`. - */ - std::unique_ptr full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - private: struct hash_join_impl; const std::unique_ptr impl; diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index ddac509d9ef..10743f9326a 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -84,28 +84,57 @@ std::unique_ptr
inner_join( // build the hash map from the smaller table. if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, left_on, compare_nulls, stream); + auto join_indices = hj_obj.inner_join(right, right_on, compare_nulls, stream, mr); + auto actual_columns_in_common = columns_in_common; std::for_each(actual_columns_in_common.begin(), actual_columns_in_common.end(), [](auto& pair) { std::swap(pair.first, pair.second); }); - auto probe_build_pair = hj_obj.inner_join(right, - right_on, - actual_columns_in_common, - cudf::hash_join::common_columns_output_side::BUILD, - compare_nulls, - stream, - mr); + + if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::INNER_JOIN)) { + auto probe_build_pair = get_empty_joined_table( + right, left, actual_columns_in_common, cudf::hash_join::common_columns_output_side::BUILD); + return cudf::detail::combine_table_pair(std::move(probe_build_pair.second), + std::move(probe_build_pair.first)); + } + + auto join_indices_view = std::make_pair( + join_indices.first->view(), join_indices.second->view()); + + auto probe_build_pair = construct_join_output_df( + right, + left, + join_indices_view, + actual_columns_in_common, + cudf::hash_join::common_columns_output_side::BUILD, + stream, + mr); + return cudf::detail::combine_table_pair(std::move(probe_build_pair.second), std::move(probe_build_pair.first)); } else { cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - auto probe_build_pair = hj_obj.inner_join(left, - left_on, - columns_in_common, - cudf::hash_join::common_columns_output_side::PROBE, - compare_nulls, - stream, - mr); + auto join_indices = hj_obj.inner_join(left, left_on, compare_nulls, stream, mr); + + if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::INNER_JOIN)) { + auto probe_build_pair = get_empty_joined_table( + left, right, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE); + return 
cudf::detail::combine_table_pair(std::move(probe_build_pair.first), + std::move(probe_build_pair.second)); + } + + auto join_indices_view = std::make_pair( + join_indices.first->view(), join_indices.second->view()); + + auto probe_build_pair = construct_join_output_df( + left, + right, + join_indices_view, + columns_in_common, + cudf::hash_join::common_columns_output_side::PROBE, + stream, + mr); + return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } @@ -223,7 +252,28 @@ std::unique_ptr
full_join( table_view const right = scatter_columns(matched.second.back(), right_on, right_input); cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, stream, mr); + auto join_indices = hj_obj.full_join(left, left_on, compare_nulls, stream, mr); + + if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::FULL_JOIN)) { + auto probe_build_pair = get_empty_joined_table( + left, right, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE); + return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), + std::move(probe_build_pair.second)); + } + + auto join_indices_view = std::make_pair( + join_indices.first->view(), join_indices.second->view()); + + auto probe_build_pair = construct_join_output_df( + left, + right, + join_indices_view, + columns_in_common, + cudf::hash_join::common_columns_output_side::PROBE, + stream, + mr); + + return combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } } // namespace detail @@ -248,19 +298,6 @@ std::pair, std::unique_ptr> hash_joi return impl->inner_join(probe, probe_on, compare_nulls, stream, mr); } -std::pair, std::unique_ptr> hash_join::inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->inner_join( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); -} - std::pair, std::unique_ptr> hash_join::left_join( cudf::table_view const& probe, std::vector const& probe_on, @@ -281,17 +318,6 @@ std::pair, std::unique_ptr> hash_joi return impl->full_join(probe, probe_on, compare_nulls, stream, mr); } -std::unique_ptr hash_join::full_join( - cudf::table_view const& probe, - 
std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); -} - // external APIs std::pair, std::unique_ptr> inner_join( From 618549255894ef4bc55d19918dd5293db1e8640e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Feb 2021 13:52:09 -0500 Subject: [PATCH 042/138] Even more cleaning --- cpp/include/cudf/join.hpp | 49 +- cpp/src/join/hash_join.cu | 261 +--- cpp/src/join/hash_join.cuh | 184 +-- cpp/src/join/join.cu | 318 ++-- cpp/src/join/join_common_utils.hpp | 8 +- cpp/src/join/semi_join.cu | 30 +- cpp/tests/join/join_tests.cpp | 2159 ++++++++++++++-------------- cpp/tests/join/semi_join_tests.cpp | 827 ----------- 8 files changed, 1260 insertions(+), 2576 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 09064ee24da..2707c60fa34 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -37,10 +37,8 @@ namespace cudf { * to the result. */ // TODO: explain this better std::pair, std::unique_ptr> inner_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -106,7 +104,6 @@ std::unique_ptr inner_join( cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -116,10 +113,8 @@ std::unique_ptr inner_join( * to the result. 
*/ // TODO: explain this better std::pair, std::unique_ptr> left_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -187,7 +182,6 @@ std::unique_ptr left_join( cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -197,10 +191,8 @@ std::unique_ptr left_join( * to the result. */ // TODO: explain this better std::pair, std::unique_ptr> full_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -268,7 +260,6 @@ std::unique_ptr full_join( cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -324,16 +315,13 @@ std::unique_ptr left_semi_join( cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** TODO: document */ std::unique_ptr left_semi_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, + cudf::table_view 
const& left_keys, + cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -389,16 +377,13 @@ std::unique_ptr left_anti_join( cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** TODO: document */ std::unique_ptr left_anti_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -457,20 +442,9 @@ class hash_join { * @param stream CUDA stream used for device memory operations and kernel launches */ hash_join(cudf::table_view const& build, - std::vector const& build_on, null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); - /** - * @brief Controls where common columns will be output for a inner join. - */ - enum class common_columns_output_side { - PROBE, ///< Common columns is output in the probe portion of the table pair returned by - ///< `inner_join`. - BUILD ///< Common columns is output in the build portion of the table pair returned by - ///< `inner_join`. 
- }; - /** * @brief Performs an inner join on the specified columns of two * tables (`left`, `right`), and returns the row indices corresponding @@ -478,7 +452,6 @@ class hash_join { */ // TODO: explain this better std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -490,7 +463,6 @@ class hash_join { */ // TODO: explain this better std::pair, std::unique_ptr> left_join( cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -502,7 +474,6 @@ class hash_join { */ // TODO: explain this better std::pair, std::unique_ptr> full_join( cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 4cc91368565..d8fe8870001 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -29,65 +29,11 @@ namespace cudf { namespace detail { -/** - * @brief Returns a vector with non-common indices which is set difference - * between `[0, num_columns)` and index values in common_column_indices - * - * @param num_columns The number of columns, which represents column indices - * from `[0, num_columns)` in a table - * @param common_column_indices A vector of common indices which needs to be - * excluded from `[0, num_columns)` - * - * @return vector A vector containing only the indices which are not present in - * `common_column_indices` - */ -std::vector 
non_common_column_indices( - size_type num_columns, std::vector const &common_column_indices) -{ - CUDF_EXPECTS(common_column_indices.size() <= static_cast(num_columns), - "Too many columns in common"); - std::vector all_column_indices(num_columns); - std::iota(std::begin(all_column_indices), std::end(all_column_indices), 0); - std::vector sorted_common_column_indices{common_column_indices}; - std::sort(std::begin(sorted_common_column_indices), std::end(sorted_common_column_indices)); - std::vector non_common_column_indices(num_columns - common_column_indices.size()); - std::set_difference(std::cbegin(all_column_indices), - std::cend(all_column_indices), - std::cbegin(sorted_common_column_indices), - std::cend(sorted_common_column_indices), - std::begin(non_common_column_indices)); - return non_common_column_indices; -} - std::pair, std::unique_ptr
> get_empty_joined_table( - table_view const &probe, - table_view const &build, - std::vector> const &columns_in_common, - cudf::hash_join::common_columns_output_side common_columns_output_side) + table_view const &probe, table_view const &build) { - std::vector columns_to_exclude(columns_in_common.size()); - std::transform(columns_in_common.begin(), - columns_in_common.end(), - columns_to_exclude.begin(), - [common_columns_output_side](auto &col) { - return common_columns_output_side == hash_join::common_columns_output_side::PROBE - ? col.second - : col.first; - }); - std::vector non_common_indices = non_common_column_indices( - common_columns_output_side == hash_join::common_columns_output_side::PROBE - ? build.num_columns() - : probe.num_columns(), - columns_to_exclude); std::unique_ptr
empty_probe = empty_like(probe); std::unique_ptr
empty_build = empty_like(build); - if (common_columns_output_side == hash_join::common_columns_output_side::PROBE) { - table_view empty_build_view = empty_build->select(non_common_indices); - empty_build = std::make_unique
(empty_build_view); - } else { - table_view empty_probe_view = empty_probe->select(non_common_indices); - empty_probe = std::make_unique
(empty_probe_view); - } return std::make_pair(std::move(empty_probe), std::move(empty_build)); } @@ -201,8 +147,6 @@ get_left_join_indices_complement(rmm::device_uvector &right_indices, * @throw cudf::logic_error if the number of columns in `build` table is 0. * @throw cudf::logic_error if the number of rows in `build` table is 0. * @throw cudf::logic_error if insertion to the hash table fails. - * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build` - * table. * * @param build Table of columns used to build join hash. * @param compare_nulls Controls whether null join-key values should match or not. @@ -321,88 +265,6 @@ std::pair, rmm::device_uvector> probe_ return std::make_pair(std::move(left_indices), std::move(right_indices)); } -/** - * @brief Combines the non common probe, common probe, non common build and common build - * columns in the correct order according to `common_columns_output_side` to form the joined - * (`probe`, `build`) table pair. - * - * @param probe_noncommon_cols Columns obtained by gathering non common probe columns. - * @param probe_noncommon_col_indices Output locations of non common probe columns in the probe - * portion. - * @param probe_common_col_indices Output locations of common probe columns in the probe portion. - * @param build_noncommon_cols Columns obtained by gathering non common build columns. - * @param build_noncommon_col_indices Output locations of non common build columns in the build - * portion. - * @param build_common_col_indices Output locations of common build columns in the build portion. - * @param common_cols Columns obtained by gathering common columns from `probe` and `build` tables - * in the build portion. - * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side. - * - * @return Table pair of (`probe`, `build`). - */ -std::pair, std::unique_ptr
> combine_join_columns( - std::vector> &&probe_noncommon_cols, - std::vector const &probe_noncommon_col_indices, - std::vector const &probe_common_col_indices, - std::vector> &&build_noncommon_cols, - std::vector const &build_noncommon_col_indices, - std::vector const &build_common_col_indices, - std::vector> &&common_cols, - cudf::hash_join::common_columns_output_side common_columns_output_side) -{ - if (common_columns_output_side == cudf::hash_join::common_columns_output_side::PROBE) { - std::vector> probe_cols(probe_noncommon_cols.size() + - common_cols.size()); - for (size_t i = 0; i < probe_noncommon_cols.size(); ++i) { - probe_cols.at(probe_noncommon_col_indices.at(i)) = std::move(probe_noncommon_cols.at(i)); - } - for (size_t i = 0; i < common_cols.size(); ++i) { - probe_cols.at(probe_common_col_indices.at(i)) = std::move(common_cols.at(i)); - } - return std::make_pair(std::make_unique(std::move(probe_cols)), - std::make_unique(std::move(build_noncommon_cols))); - } else { - std::vector> build_cols(build_noncommon_cols.size() + - common_cols.size()); - for (size_t i = 0; i < build_noncommon_cols.size(); ++i) { - build_cols.at(build_noncommon_col_indices.at(i)) = std::move(build_noncommon_cols.at(i)); - } - for (size_t i = 0; i < common_cols.size(); ++i) { - build_cols.at(build_common_col_indices.at(i)) = std::move(common_cols.at(i)); - } - return std::make_pair(std::make_unique(std::move(probe_noncommon_cols)), - std::make_unique(std::move(build_cols))); - } -} - -/** - * @brief Gathers rows from `probe` and `build` table and returns a (`probe`, `build`) table pair, - * which contains the probe and build portions of the logical joined table respectively. - * - * @tparam JoinKind The type of join to be performed - * - * @param probe Probe side table - * @param build build side table - * @param joined_indices Pair of vectors containing row indices from which - * `probe` and `build` tables are gathered. 
If any row index is out of bounds, - * the contribution in the output `table` will be NULL. - * @param columns_in_common is a vector of pairs of column indices - * from tables `probe` and `build` respectively, that are "in common". - * For "common" columns, only a single output column will be produced. - * For an inner or left join, the result will be gathered from the column in - * `probe`. For a full join, the result will be gathered from both common - * columns in `probe` and `build` and concatenated to form a single column. - * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side. - * - * @return Table pair of (`probe`, `build`) containing the rows from `probe` and - * `build` specified by `joined_indices`. - * Columns in `columns_in_common` will be included in either `probe` or `build` portion as - * `common_columns_output_side` indicates. Final form would look like - * (`probe(including common columns)`, `build(excluding common columns)`) if - * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`, - * `build(including common columns)`) if `common_columns_output_side` is `BUILD`. 
- */ - std::unique_ptr combine_table_pair(std::unique_ptr &&left, std::unique_ptr &&right) { @@ -419,100 +281,53 @@ std::unique_ptr combine_table_pair(std::unique_ptr &&l hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, - std::vector const &build_on, null_equality compare_nulls, rmm::cuda_stream_view stream) - : _build(build), - _build_selected(build.select(build_on)), - _build_on(build_on), - _hash_table(nullptr) + : _build(build), _hash_table(nullptr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(0 != _build.num_columns(), "Hash join build table is empty"); CUDF_EXPECTS(_build.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Build column size is too big for hash join"); - if (_build_on.empty() || 0 == build.num_rows()) { return; } + if (0 == build.num_rows()) { return; } - _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream); + _hash_table = build_join_hash_table(_build, compare_nulls, stream); } std::pair, std::unique_ptr> hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, - std::vector const &probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, probe_on, compare_nulls, stream, mr); -} - -std::pair, std::unique_ptr> -hash_join::hash_join_impl::inner_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); + return compute_hash_join(probe, compare_nulls, stream, mr); } std::pair, std::unique_ptr> hash_join::hash_join_impl::left_join(cudf::table_view const &probe, - std::vector const 
&probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, probe_on, compare_nulls, stream, mr); + return compute_hash_join(probe, compare_nulls, stream, mr); } std::pair, std::unique_ptr> hash_join::hash_join_impl::full_join(cudf::table_view const &probe, - std::vector const &probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, probe_on, compare_nulls, stream, mr); -} - -std::unique_ptr hash_join::hash_join_impl::full_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - CUDF_FUNC_RANGE(); - auto probe_build_pair = - compute_hash_join(probe, - probe_on, - columns_in_common, - common_columns_output_side::PROBE, - compare_nulls, - stream, - mr); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); + return compute_hash_join(probe, compare_nulls, stream, mr); } template std::pair, rmm::device_uvector> hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &probe, - std::vector const &probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) const @@ -520,42 +335,39 @@ hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &pro CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Probe column size is too big for hash join"); - CUDF_EXPECTS(_build_on.size() == probe_on.size(), + CUDF_EXPECTS(_build.num_columns() == probe.num_columns(), "Mismatch in number of columns to be joined on"); - if (is_trivial_join(probe, _build, probe_on, _build_on, 
JoinKind)) { + if (is_trivial_join(probe, _build, JoinKind)) { return std::make_pair(rmm::device_uvector{0, stream}, rmm::device_uvector{0, stream}); } - auto probe_selected = probe.select(probe_on); - CUDF_EXPECTS(std::equal(std::cbegin(_build_selected), - std::cend(_build_selected), - std::cbegin(probe_selected), - std::cend(probe_selected), + CUDF_EXPECTS(std::equal(std::cbegin(_build), + std::cend(_build), + std::cbegin(probe), + std::cend(probe), [](const auto &b, const auto &p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); - return probe_join_indices(probe_selected, compare_nulls, stream); + return probe_join_indices(probe, compare_nulls, stream); } template std::pair, std::unique_ptr> hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, - std::vector const &probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) const { - auto join_indices = - compute_hash_join_indices(probe, probe_on, compare_nulls, stream, mr); - auto join_size = join_indices.first.size(); - auto left_map = std::make_unique(cudf::data_type(type_to_id()), + auto join_indices = compute_hash_join_indices(probe, compare_nulls, stream, mr); + auto join_size = join_indices.first.size(); + auto left_map = std::make_unique(cudf::data_type(type_to_id()), join_size, join_indices.first.release(), rmm::device_buffer{}, 0); - auto right_map = std::make_unique(cudf::data_type(type_to_id()), + auto right_map = std::make_unique(cudf::data_type(type_to_id()), join_size, join_indices.second.release(), rmm::device_buffer{}, @@ -564,41 +376,6 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, std::move(left_map), std::move(right_map)); } -template -std::pair, std::unique_ptr> -hash_join::hash_join_impl::compute_hash_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - common_columns_output_side 
common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - CUDF_EXPECTS(std::all_of(columns_in_common.begin(), - columns_in_common.end(), - [this, &probe_on](auto pair) { - size_t p = std::find(probe_on.begin(), probe_on.end(), pair.first) - - probe_on.begin(); - size_t b = std::find(_build_on.begin(), _build_on.end(), pair.second) - - _build_on.begin(); - return (p != probe_on.size()) && (b != _build_on.size()) && (p == b); - }), - "Invalid values passed to columns_in_common"); - - auto joined_indices = compute_hash_join(probe, probe_on, compare_nulls, stream, mr); - - if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { - return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side); - } - - auto joined_indices_view = std::make_pair( - joined_indices.first->view(), joined_indices.second->view()); - - return cudf::detail::construct_join_output_df( - probe, _build, joined_indices_view, columns_in_common, common_columns_output_side, stream, mr); -} - template std::pair, rmm::device_uvector> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, @@ -612,7 +389,7 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, CUDF_EXPECTS(_hash_table, "Hash table of hash join is null."); - auto build_table = cudf::table_device_view::create(_build_selected, stream); + auto build_table = cudf::table_device_view::create(_build, stream); auto probe_table = cudf::table_device_view::create(probe, stream); constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 16a4edf7d5f..d547d5190c4 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -195,111 +195,11 @@ get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stre } std::pair, std::unique_ptr
> get_empty_joined_table( - table_view const& probe, - table_view const& build, - std::vector> const& columns_in_common, - cudf::hash_join::common_columns_output_side common_columns_output_side); + table_view const& probe, table_view const& build); std::unique_ptr combine_table_pair(std::unique_ptr&& left, std::unique_ptr&& right); -std::pair, std::unique_ptr
> combine_join_columns( - std::vector>&& probe_noncommon_cols, - std::vector const& probe_noncommon_col_indices, - std::vector const& probe_common_col_indices, - std::vector>&& build_noncommon_cols, - std::vector const& build_noncommon_col_indices, - std::vector const& build_common_col_indices, - std::vector>&& common_cols, - cudf::hash_join::common_columns_output_side common_columns_output_side); - -std::vector non_common_column_indices( - size_type num_columns, std::vector const& common_column_indices); - -template -std::pair, std::unique_ptr
> construct_join_output_df( - table_view const& probe, - table_view const& build, - std::pair& joined_indices, - std::vector> const& columns_in_common, - cudf::hash_join::common_columns_output_side common_columns_output_side, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - std::vector probe_common_col; - probe_common_col.reserve(columns_in_common.size()); - std::vector build_common_col; - build_common_col.reserve(columns_in_common.size()); - for (const auto& c : columns_in_common) { - probe_common_col.push_back(c.first); - build_common_col.push_back(c.second); - } - std::vector probe_noncommon_col = - non_common_column_indices(probe.num_columns(), probe_common_col); - std::vector build_noncommon_col = - non_common_column_indices(build.num_columns(), build_common_col); - - out_of_bounds_policy const bounds_policy = JoinKind != join_kind::INNER_JOIN - ? out_of_bounds_policy::NULLIFY - : out_of_bounds_policy::DONT_CHECK; - - std::unique_ptr
common_table = std::make_unique
(); - // Construct the joined columns - if (join_kind::FULL_JOIN == JoinKind) { - if (not columns_in_common.empty()) { - auto common_from_build = - detail::gather(build.select(build_common_col), - joined_indices.second.begin() + probe.num_rows(), - joined_indices.second.end(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - auto common_from_probe = - detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.begin() + probe.num_rows(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - common_table = cudf::detail::concatenate( - {common_from_probe->view(), common_from_build->view()}, stream, mr); - } - } else { - if (not columns_in_common.empty()) { - common_table = detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.end(), - bounds_policy, - stream, - mr); - } - } - - // Construct the probe non common columns - std::unique_ptr
probe_table = detail::gather(probe.select(probe_noncommon_col), - joined_indices.first.begin(), - joined_indices.first.end(), - bounds_policy, - stream, - mr); - - std::unique_ptr
build_table = detail::gather(build.select(build_noncommon_col), - joined_indices.second.begin(), - joined_indices.second.end(), - bounds_policy, - stream, - mr); - - return combine_join_columns(probe_table->release(), - probe_noncommon_col, - probe_common_col, - build_table->release(), - build_noncommon_col, - build_common_col, - common_table->release(), - common_columns_output_side); -} - } // namespace detail struct hash_join::hash_join_impl { @@ -313,64 +213,37 @@ struct hash_join::hash_join_impl { private: cudf::table_view _build; - cudf::table_view _build_selected; - std::vector _build_on; std::unique_ptr> _hash_table; public: /** - * @brief Constructor that internally builds the hash table based on the given `build` table and - * column indices specified by `build_on` for subsequent probe calls. + * @brief Constructor that internally builds the hash table based on the given `build` table * * @throw cudf::logic_error if the number of columns in `build` table is 0. * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. - * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build` - * table. * * @param build The build table, from which the hash table is built. - * @param build_on The column indices from `build` to join on. * @param compare_nulls Controls whether null join-key values should match or not. 
*/ hash_join_impl(cudf::table_view const& build, - std::vector const& build_on, null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair, std::unique_ptr> inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; std::pair, std::unique_ptr> left_join( cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; std::pair, std::unique_ptr> full_join( cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::unique_ptr full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -379,7 +252,6 @@ struct hash_join::hash_join_impl { template std::pair, rmm::device_uvector> compute_hash_join_indices(cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -387,58 +259,6 @@ struct hash_join::hash_join_impl { template std::pair, std::unique_ptr> compute_hash_join( cudf::table_view const& probe, - std::vector const& probe_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - /** - * @brief Performs hash join by probing the columns provided in `probe` as per - 
* the joining indices given in `probe_on` and returns a (`probe`, `_build`) table pair, which - * contains the probe and build portions of the logical joined table respectively. - * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (`P`, `B`) where `P` does not exist in `probe_on` or `B` does not exist in - * `_build_on`. - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (`P`, `B`) such that the location of `P` within `probe_on` is not equal to - * the location of `B` within `_build_on`. - * @throw cudf::logic_error if the number of elements in `probe_on` and - * `_build_on` are not equal. - * @throw cudf::logic_error if the number of columns in `probe` is 0. - * @throw cudf::logic_error if the number of rows in `probe` table exceeds MAX_JOIN_SIZE. - * @throw std::out_of_range if elements of `probe_on` exceed the number of columns in the `probe` - * table. - * @throw cudf::logic_error if types do not match between joining columns. - * - * @tparam JoinKind The type of join to be performed. - * - * @param probe The probe table. - * @param probe_on The column's indices from `probe` to join on. - * Column `i` from `probe_on` will be compared against column `i` of `_build_on`. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `_build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns. Else, for every column in `probe_on` and `_build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `_build_on`. - * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param mr Device memory resource used to allocate the returned table's device memory. 
- * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table pair of (`probe`, `_build`) of joining both tables on the columns - * specified by `probe_on` and `_build_on`. The resulting table pair will be joined columns of - * (`probe(including common columns)`, `_build(excluding common columns)`) if - * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`, - * `_build(including common columns)`) if `common_columns_output_side` is `BUILD`. - */ - template - std::pair, std::unique_ptr> compute_hash_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 10743f9326a..9e61a924e03 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -29,8 +29,6 @@ namespace detail { std::pair, std::unique_ptr> inner_join( table_view const& left_input, table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -38,35 +36,33 @@ std::pair, std::unique_ptr> inner_jo // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. 
auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input.select(left_on), right_input.select(right_on)}, + {left_input, right_input}, stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones - auto const left = scatter_columns(matched.second.front(), left_on, left_input); - auto const right = scatter_columns(matched.second.back(), right_on, right_input); + auto const left = matched.second.front(); + auto const right = matched.second.back(); // For `inner_join`, we can freely choose either the `left` or `right` table to use for // building/probing the hash map. Because building is typically more expensive than probing, we // build the hash map from the smaller table. if (right.num_rows() > left.num_rows()) { - cudf::hash_join hj_obj(left, left_on, compare_nulls, stream); - return hj_obj.inner_join(right, right_on, compare_nulls, stream, mr); + cudf::hash_join hj_obj(left, compare_nulls, stream); + return hj_obj.inner_join(right, compare_nulls, stream, mr); } else { - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - return hj_obj.inner_join(left, left_on, compare_nulls, stream, mr); + cudf::hash_join hj_obj(right, compare_nulls, stream); + return hj_obj.inner_join(left, compare_nulls, stream, mr); } } -std::unique_ptr
inner_join( - table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
inner_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -83,68 +79,51 @@ std::unique_ptr
inner_join( // building/probing the hash map. Because building is typically more expensive than probing, we // build the hash map from the smaller table. if (right.num_rows() > left.num_rows()) { - cudf::hash_join hj_obj(left, left_on, compare_nulls, stream); - auto join_indices = hj_obj.inner_join(right, right_on, compare_nulls, stream, mr); - - auto actual_columns_in_common = columns_in_common; - std::for_each(actual_columns_in_common.begin(), actual_columns_in_common.end(), [](auto& pair) { - std::swap(pair.first, pair.second); - }); - - if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::INNER_JOIN)) { - auto probe_build_pair = get_empty_joined_table( - right, left, actual_columns_in_common, cudf::hash_join::common_columns_output_side::BUILD); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.second), - std::move(probe_build_pair.first)); - } - + cudf::hash_join hj_obj(left.select(left_on), compare_nulls, stream); + auto join_indices = hj_obj.inner_join(right.select(right_on), compare_nulls, stream, mr); auto join_indices_view = std::make_pair( join_indices.first->view(), join_indices.second->view()); - - auto probe_build_pair = construct_join_output_df( - right, - left, - join_indices_view, - actual_columns_in_common, - cudf::hash_join::common_columns_output_side::BUILD, - stream, - mr); - - return cudf::detail::combine_table_pair(std::move(probe_build_pair.second), - std::move(probe_build_pair.first)); + std::unique_ptr
left_result = + detail::gather(left, + join_indices_view.second.template begin(), + join_indices_view.second.template end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + std::unique_ptr
right_result = + detail::gather(right, + join_indices_view.first.template begin(), + join_indices_view.first.template end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); } else { - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - auto join_indices = hj_obj.inner_join(left, left_on, compare_nulls, stream, mr); - - if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::INNER_JOIN)) { - auto probe_build_pair = get_empty_joined_table( - left, right, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); - } - + cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); + auto join_indices = hj_obj.inner_join(left.select(left_on), compare_nulls, stream, mr); auto join_indices_view = std::make_pair( join_indices.first->view(), join_indices.second->view()); - - auto probe_build_pair = construct_join_output_df( - left, - right, - join_indices_view, - columns_in_common, - cudf::hash_join::common_columns_output_side::PROBE, - stream, - mr); - - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); + std::unique_ptr
left_result = + detail::gather(left, + join_indices_view.first.template begin(), + join_indices_view.first.template end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + std::unique_ptr
right_result = + detail::gather(right, + join_indices_view.second.template begin(), + join_indices_view.second.template end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); } } std::pair, std::unique_ptr> left_join( table_view const& left_input, table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -152,26 +131,24 @@ std::pair, std::unique_ptr> left_joi // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input.select(left_on), right_input.select(right_on)}, // these should match + {left_input, right_input}, // these should match stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones - table_view const left = scatter_columns(matched.second.front(), left_on, left_input); - table_view const right = scatter_columns(matched.second.back(), right_on, right_input); + table_view const left = matched.second.front(); + table_view const right = matched.second.back(); - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - return hj_obj.left_join(left, left_on, compare_nulls, stream, mr); + cudf::hash_join hj_obj(right, compare_nulls, stream); + return hj_obj.left_join(left, compare_nulls, stream, mr); } -std::unique_ptr
left_join( - table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
left_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -183,36 +160,38 @@ std::unique_ptr
left_join( table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - auto join_indices = hj_obj.left_join(left, left_on, compare_nulls, stream, mr); + cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); + auto join_indices = hj_obj.left_join(left.select(left_on), compare_nulls, stream, mr); - if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::LEFT_JOIN)) { - auto probe_build_pair = get_empty_joined_table( - left, right, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE); + if ((left_on.empty() || right_on.empty()) || + is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) { + auto probe_build_pair = get_empty_joined_table(left, right); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } auto join_indices_view = std::make_pair( join_indices.first->view(), join_indices.second->view()); - - auto probe_build_pair = construct_join_output_df( - left, - right, - join_indices_view, - columns_in_common, - cudf::hash_join::common_columns_output_side::PROBE, - stream, - mr); - - return combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); + std::unique_ptr
left_result = + detail::gather(left, + join_indices_view.first.template begin(), + join_indices_view.first.template end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + std::unique_ptr
right_result = + detail::gather(right, + join_indices_view.second.template begin(), + join_indices_view.second.template end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); } std::pair, std::unique_ptr> full_join( table_view const& left_input, table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -220,26 +199,24 @@ std::pair, std::unique_ptr> full_joi // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input.select(left_on), right_input.select(right_on)}, // these should match + {left_input, right_input}, // these should match stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones - table_view const left = scatter_columns(matched.second.front(), left_on, left_input); - table_view const right = scatter_columns(matched.second.back(), right_on, right_input); + table_view const left = matched.second.front(); + table_view const right = matched.second.back(); - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - return hj_obj.full_join(left, left_on, compare_nulls, stream, mr); + cudf::hash_join hj_obj(right, compare_nulls, stream); + return hj_obj.full_join(left, compare_nulls, stream, mr); } -std::unique_ptr
full_join( - table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
full_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -251,29 +228,33 @@ std::unique_ptr
full_join( table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - auto join_indices = hj_obj.full_join(left, left_on, compare_nulls, stream, mr); + cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); + auto join_indices = hj_obj.full_join(left.select(left_on), compare_nulls, stream, mr); - if (is_trivial_join(left, right, left_on, right_on, cudf::detail::join_kind::FULL_JOIN)) { - auto probe_build_pair = get_empty_joined_table( - left, right, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE); + if ((left_on.empty() || right_on.empty()) || + is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) { + auto probe_build_pair = get_empty_joined_table(left, right); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } auto join_indices_view = std::make_pair( join_indices.first->view(), join_indices.second->view()); - - auto probe_build_pair = construct_join_output_df( - left, - right, - join_indices_view, - columns_in_common, - cudf::hash_join::common_columns_output_side::PROBE, - stream, - mr); - - return combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); + std::unique_ptr
left_result = + detail::gather(left, + join_indices_view.first.template begin(), + join_indices_view.first.template end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + std::unique_ptr
right_result = + detail::gather(right, + join_indices_view.second.template begin(), + join_indices_view.second.template end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); } } // namespace detail @@ -281,41 +262,37 @@ std::unique_ptr
full_join( hash_join::~hash_join() = default; hash_join::hash_join(cudf::table_view const& build, - std::vector const& build_on, null_equality compare_nulls, rmm::cuda_stream_view stream) - : impl{std::make_unique(build, build_on, compare_nulls, stream)} + : impl{std::make_unique(build, compare_nulls, stream)} { } std::pair, std::unique_ptr> hash_join::inner_join( cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->inner_join(probe, probe_on, compare_nulls, stream, mr); + return impl->inner_join(probe, compare_nulls, stream, mr); } std::pair, std::unique_ptr> hash_join::left_join( cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->left_join(probe, probe_on, compare_nulls, stream, mr); + return impl->left_join(probe, compare_nulls, stream, mr); } std::pair, std::unique_ptr> hash_join::full_join( cudf::table_view const& probe, - std::vector const& probe_on, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->full_join(probe, probe_on, compare_nulls, stream, mr); + return impl->full_join(probe, compare_nulls, stream, mr); } // external APIs @@ -323,82 +300,67 @@ std::pair, std::unique_ptr> hash_joi std::pair, std::unique_ptr> inner_join( table_view const& left, table_view const& right, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::inner_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::inner_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr
inner_join( - table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
inner_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::inner_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } std::pair, std::unique_ptr> left_join( table_view const& left, table_view const& right, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr
left_join( - table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
left_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } std::pair, std::unique_ptr> full_join( table_view const& left, table_view const& right, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::full_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::full_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr
full_join( - table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
full_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::full_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 917bcb9bdd5..1fcfffb96bb 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -51,14 +51,10 @@ using row_equality = cudf::row_equality_comparator; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; -inline bool is_trivial_join(table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - join_kind join_type) +inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) { // If there is nothing to join, then send empty table with all columns - if (left_on.empty() || right_on.empty()) { return true; } + if ((0 == left.num_columns()) || (0 == right.num_columns())) { return true; } // If left join and the left table is empty, return immediately if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 8d91ee38725..919311f41bf 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -133,8 +133,6 @@ std::unique_ptr left_semi_anti_join( * The column from `right` indicated by `right_on[i]` * will be compared against the column from `left` * indicated by `left_on[i]`. - * @param[in] return_columns A vector of column indices from `left` to - * include in the returned table. * @param[in] compare_nulls Controls whether null join-key values should match or not. 
* @param[in] mr Device memory resource to used to allocate the returned table's * device memory @@ -142,8 +140,7 @@ std::unique_ptr left_semi_anti_join( * @tparam join_kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN * * @returns Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table - * will contain `return_columns` from `left` that match in right. + * specified by `left_on` and `right_on`. */ template std::unique_ptr left_semi_anti_join( @@ -151,22 +148,19 @@ std::unique_ptr left_semi_anti_join( cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); - if (0 == return_columns.size()) { return empty_like(left.select(return_columns)); } - - if (is_trivial_join(left, right, left_on, right_on, JoinKind)) { - return empty_like(left.select(return_columns)); + if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, JoinKind)) { + return empty_like(left); } if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) { // Everything matches, just copy the proper columns from the left table - return std::make_unique
(left.select(return_columns), stream, mr); + return std::make_unique
(left, stream, mr); } // Make sure any dictionary columns have matched key sets. @@ -183,7 +177,7 @@ std::unique_ptr left_semi_anti_join( left_semi_anti_join(left_selected, right_selected, compare_nulls, stream); auto const left_updated = scatter_columns(left_selected, left_on, left); - return cudf::detail::gather(left_updated.select(return_columns), + return cudf::detail::gather(left_updated, gather_map->view().template begin(), gather_map->view().template end(), out_of_bounds_policy::DONT_CHECK, @@ -197,50 +191,44 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_semi_join(cudf::table_view const& left, cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left.select(left_on), right.select(right_on), compare_nulls, rmm::cuda_stream_default, mr); + left, right, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, 
cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left.select(left_on), right.select(right_on), compare_nulls, rmm::cuda_stream_default, mr); + left, right, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 9fbccbda795..ec15e5b03c7 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -62,59 +62,11 @@ TEST_F(JoinTest, EmptySentinelRepro) cudf::table_view left({left_first_col, left_second_col, left_third_col}); cudf::table_view right({right_first_col, right_second_col, right_third_col}); - auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2}, {{0, 0}, {1, 1}, {2, 2}}); + auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2}); EXPECT_EQ(result->num_rows(), 1); } -TEST_F(JoinTest, InvalidCommonColumnIndices) -{ - column_wrapper col0_0{{3, 1, 2, 0, 3}}; - column_wrapper col0_1{{0, 1, 2, 4, 1}}; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - column_wrapper col1_1{{1, 0, 1, 2, 1}}; - - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - EXPECT_THROW(cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 1}, {1, 0}}), cudf::logic_error); -} - -TEST_F(JoinTest, FullJoinNoCommon) -{ - column_wrapper col0_0{{0, 1}}; - column_wrapper col1_0{{0, 2}}; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols1.push_back(col1_0.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - column_wrapper exp_col0_0{{0, 1, -1}, {1, 1, 0}}; - column_wrapper exp_col0_1{{0, -1, 2}, {1, 0, 1}}; - CVector exp_cols; - exp_cols.push_back(exp_col0_0.release()); - 
exp_cols.push_back(exp_col0_1.release()); - Table gold(std::move(exp_cols)); - - auto result = - cudf::full_join(t0, t1, {0}, {0}, std::vector>{}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} - TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) { column_wrapper col0_0{{3, 1, 2, 0, 3}}; @@ -136,8 +88,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = - cudf::left_join(t0, t1, {0}, {0}, std::vector>{}); + auto result = cudf::left_join(t0, t1, {0}, {0}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -183,19 +134,27 @@ TEST_F(JoinTest, FullJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{2, 2, 0, 4, 3, 3, 1, 2, 0}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}); - column_wrapper col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""}, + {1, 1, 1, 1, 1, 0, 0, 0, 0}); + column_wrapper col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 
0, 0, 1, 1, 1, 1, 1}}; + strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"}, + {0, 0, 0, 0, 1, 1, 1, 1, 1}); + column_wrapper col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); @@ -224,19 +183,27 @@ TEST_F(JoinTest, FullJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{2, 2, 0, -1, 3, 3, 1, 2, 0}, {1, 1, 1, 0, 1, 1, 1, 1, 1}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}); - column_wrapper col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""}, + {1, 1, 1, 1, 1, 0, 0, 0, 0}); + column_wrapper col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 0}}; + strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"}, + {0, 0, 0, 0, 1, 1, 1, 1, 1}); + column_wrapper col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); 
cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); @@ -268,7 +235,7 @@ TEST_F(JoinTest, FullJoinOnNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -279,20 +246,26 @@ TEST_F(JoinTest, FullJoinOnNulls) cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t"); cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t"); #endif - - column_wrapper col_gold_0{{ 2, 5, 3, -1}, - { 1, 1, 1, 0}}; - strcol_wrapper col_gold_1({ "s1", "s0", "s0", "s1"}); - column_wrapper col_gold_2{{ -1, -1, 0, 1}, - { 0, 0, 1, 1}}; - column_wrapper col_gold_3{{ 1, 4, 2, 8}, - { 1, 1, 1, 1}}; + + column_wrapper col_gold_0{{ 3, -1, -1, -1}, + { 1, 0, 0, 0}}; + strcol_wrapper col_gold_1{{ "s0", "s1", "", ""}, + { 1, 1, 0, 0}}; + column_wrapper col_gold_2{{ 0, 1, -1, -1}, + { 1, 1, 0, 0}}; + column_wrapper col_gold_3{{ 3, -1, 2, 5}, + { 1, 0, 1, 1}}; + strcol_wrapper col_gold_4{{ "s0", "s1", "s1", "s0"}}; + column_wrapper col_gold_5{{ 2, 8, 1, 4}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); @@ -306,22 +279,27 @@ TEST_F(JoinTest, FullJoinOnNulls) cudf::test::print(sorted_gold->get_column(3).view(), std::cout, 
",\t\t"); #endif - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); // Repeat test with compare_nulls_equal=false, // as per SQL standard. - result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL); + result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); sorted_result = cudf::gather(result->view(), *result_sort_order); - col_gold_0 = {{ 2, 5, 3, -1, -1}, - { 1, 1, 1, 0, 0}}; - col_gold_1 = strcol_wrapper({ "s1", "s0", "s0", "s1", "s1"}); - col_gold_2 = {{ -1, -1, 0, -1, 1}, - { 0, 0, 1, 0, 1}}; - col_gold_3 = {{ 1, 4, 2, 8, -1}, - { 1, 1, 1, 1, 0}}; + col_gold_0 = {{ 3, -1, -1, -1, -1}, + { 1, 0, 0, 0, 0}}; + col_gold_1 = strcol_wrapper{{ "s0", "s1", "", "", ""}, + { 1, 1, 0, 0, 0}}; + col_gold_2 = {{ 0, 1, -1, -1, -1}, + { 1, 1, 0, 0, 0}}; + col_gold_3 = {{ 3, -1, 2, 5, -1}, + { 1, 0, 1, 1, 0}}; + col_gold_4 = strcol_wrapper{{ "s0", "", "s1", "s0", "s1"}, + { 1, 0, 1, 1, 1}}; + col_gold_5 = {{ 2, -1, 1, 4, 8}, + { 1, 0, 1, 1, 1}}; // clang-format on @@ -330,23 +308,26 @@ TEST_F(JoinTest, FullJoinOnNulls) cols_gold_nulls_unequal.push_back(col_gold_1.release()); cols_gold_nulls_unequal.push_back(col_gold_2.release()); cols_gold_nulls_unequal.push_back(col_gold_3.release()); + cols_gold_nulls_unequal.push_back(col_gold_4.release()); + cols_gold_nulls_unequal.push_back(col_gold_5.release()); + Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)}; gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view()); sorted_gold = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, LeftJoinNoNulls) { - column_wrapper col0_0{{3, 1, 2, 0, 3}}; + column_wrapper col0_0({3, 1, 2, 0, 3}); strcol_wrapper col0_1({"s0", 
"s1", "s2", "s4", "s1"}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; + column_wrapper col0_2({0, 1, 2, 4, 1}); - column_wrapper col1_0{{2, 2, 0, 4, 3}}; + column_wrapper col1_0({2, 2, 0, 4, 3}); strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; + column_wrapper col1_2({1, 0, 1, 2, 1}); CVector cols0, cols1; cols0.push_back(col0_0.release()); @@ -359,34 +340,38 @@ TEST_F(JoinTest, LeftJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}); - column_wrapper col_gold_2{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; + column_wrapper col_gold_0({3, 1, 2, 0, 3}); + strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1"}); + column_wrapper col_gold_2({0, 1, 2, 4, 1}); + column_wrapper col_gold_3{{-1, -1, -1, -1, 3}, {0, 0, 0, 0, 1}}; + strcol_wrapper col_gold_4{{"", "", "", "", "s1"}, {0, 0, 0, 0, 1}}; + column_wrapper col_gold_5{{-1, -1, -1, -1, 1}, {0, 0, 0, 0, 1}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, 
LeftJoinWithNulls) { column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); + strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1}); column_wrapper col0_2{{0, 1, 2, 4, 1}}; column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, ); column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; CVector cols0, cols1; @@ -400,19 +385,24 @@ TEST_F(JoinTest, LeftJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{3, 2, 1, 2, 0}, {1, 1, 1, 1, 1}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "", "s4"}, {1, 1, 1, 0, 1}); - column_wrapper col_gold_2{{0, 1, 1, 2, 4}, {1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; + column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {1, 1, 1, 1, 1}}; + strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col_gold_2{{0, 1, 2, 4, 1}, {1, 1, 1, 1, 1}}; + column_wrapper col_gold_3{{3, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; + strcol_wrapper col_gold_4{{"s1", "", "", "", ""}, {1, 0, 0, 0, 0}}; + column_wrapper col_gold_5{{1, 1, -1, 1, 1}, {1, 0, 1, 1, 1}}; + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); @@ -420,1118 +410,1125 @@ TEST_F(JoinTest, LeftJoinWithNulls) 
CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); } -TEST_F(JoinTest, LeftJoinOnNulls) -{ - // clang-format off - column_wrapper col0_0{{ 3, 1, 2}, - { 1, 0, 1}}; - strcol_wrapper col0_1({"s0", "s1", "s2" }); - column_wrapper col0_2{{ 0, 1, 2 }}; +// TEST_F(JoinTest, LeftJoinOnNulls) +// { +// // clang-format off +// column_wrapper col0_0{{ 3, 1, 2}, +// { 1, 0, 1}}; +// strcol_wrapper col0_1({"s0", "s1", "s2" }); +n// column_wrapper col0_2{{ 0, 1, 2 }}; + +// column_wrapper col1_0{{ 2, 5, 3, 7 }, +// { 1, 1, 1, 0 }}; +// strcol_wrapper col1_1({"s1", "s0", "s0", "s1" }); +// column_wrapper col1_2{{ 1, 4, 2, 8 }}; + +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols0.push_back(col0_2.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); +// cols1.push_back(col1_2.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - column_wrapper col1_0{{ 2, 5, 3, 7 }, - { 1, 1, 1, 0 }}; - strcol_wrapper col1_1({"s1", "s0", "s0", "s1" }); - column_wrapper col1_2{{ 1, 4, 2, 8 }}; +// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// #if 0 +// std::cout << "Actual Results:\n"; +// cudf::test::print(sorted_result->get_column(0).view(), std::cout, ",\t\t"); +// cudf::test::print(sorted_result->get_column(1).view(), std::cout, ",\t\t"); +// cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t"); +// cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t"); +// #endif + +// column_wrapper col_gold_0{{ 3, -1, 2}, +// { 1, 0, 1}}; +// strcol_wrapper col_gold_1({ "s0", "s1", "s2"}, +// { 1, 1, 1}); +// column_wrapper col_gold_2{{ 0, 1, 2}, +// { 1, 1, 1}}; +// column_wrapper col_gold_3{{ 2, 8, -1}, +// { 1, 1, 0}}; + +// CVector cols_gold; +// 
cols_gold.push_back(col_gold_0.release()); +// cols_gold.push_back(col_gold_1.release()); +// cols_gold.push_back(col_gold_2.release()); +// cols_gold.push_back(col_gold_3.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + +// #if 0 +// std::cout << "Expected Results:\n"; +// cudf::test::print(sorted_gold->get_column(0).view(), std::cout, ",\t\t"); +// cudf::test::print(sorted_gold->get_column(1).view(), std::cout, ",\t\t"); +// cudf::test::print(sorted_gold->get_column(2).view(), std::cout, ",\t\t"); +// cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t"); +// #endif + +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + +// // Repeat test with compare_nulls_equal=false, +// // as per SQL standard. + +// result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, +// cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); +// sorted_result = cudf::gather(result->view(), *result_sort_order); + +// col_gold_0 = {{ 3, -1, 2}, +// { 1, 0, 1}}; +// col_gold_1 = strcol_wrapper({ "s0", "s1", "s2"}, +// { 1, 1, 1}); +// col_gold_2 = {{ 0, 1, 2}, +// { 1, 1, 1}}; +// col_gold_3 = {{ 2, -1, -1}, +// { 1, 0, 0}}; + +// // clang-format on +// CVector cols_gold_nulls_unequal; +// cols_gold_nulls_unequal.push_back(col_gold_0.release()); +// cols_gold_nulls_unequal.push_back(col_gold_1.release()); +// cols_gold_nulls_unequal.push_back(col_gold_2.release()); +// cols_gold_nulls_unequal.push_back(col_gold_3.release()); +// Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)}; + +// gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view()); +// sorted_gold = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order); + +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - 
cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); +// TEST_F(JoinTest, InnerJoinSizeOverflow) +// { +// auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); +// zero->set_valid(true); +// static_cast *>(zero.get())->set_value(0); - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// // Should cause size overflow, raise exception +// int32_t left = 4; +// int32_t right = 1073741825; - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); +// auto col0_0 = cudf::make_column_from_scalar(*zero, left); +// auto col1_0 = cudf::make_column_from_scalar(*zero, right); -#if 0 - std::cout << "Actual Results:\n"; - cudf::test::print(sorted_result->get_column(0).view(), std::cout, ",\t\t"); - cudf::test::print(sorted_result->get_column(1).view(), std::cout, ",\t\t"); - cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t"); - cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t"); -#endif - - column_wrapper col_gold_0{{ 3, -1, 2}, - { 1, 0, 1}}; - strcol_wrapper col_gold_1({ "s0", "s1", "s2"}, - { 1, 1, 1}); - column_wrapper col_gold_2{{ 0, 1, 2}, - { 1, 1, 1}}; - column_wrapper col_gold_3{{ 2, 8, -1}, - { 1, 1, 0}}; +// CVector cols0, cols1; +// cols0.push_back(std::move(col0_0)); +// cols1.push_back(std::move(col1_0)); - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - cols_gold.push_back(col_gold_2.release()); - cols_gold.push_back(col_gold_3.release()); - Table gold(std::move(cols_gold)); +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), 
*gold_sort_order); +// EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}), cudf::logic_error); +// } -#if 0 - std::cout << "Expected Results:\n"; - cudf::test::print(sorted_gold->get_column(0).view(), std::cout, ",\t\t"); - cudf::test::print(sorted_gold->get_column(1).view(), std::cout, ",\t\t"); - cudf::test::print(sorted_gold->get_column(2).view(), std::cout, ",\t\t"); - cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t"); -#endif +// TEST_F(JoinTest, InnerJoinNoNulls) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 2}}; +// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols0.push_back(col0_2.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); +// cols1.push_back(col1_2.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// column_wrapper col_gold_0{{3, 2, 2}}; +// strcol_wrapper col_gold_1({"s1", "s0", "s0"}); +// column_wrapper col_gold_2{{0, 2, 1}}; +// column_wrapper col_gold_3{{1, 0, 0}}; +// CVector cols_gold; +// cols_gold.push_back(col_gold_0.release()); +// cols_gold.push_back(col_gold_1.release()); +// cols_gold.push_back(col_gold_2.release()); +// cols_gold.push_back(col_gold_3.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// 
CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } - // Repeat test with compare_nulls_equal=false, - // as per SQL standard. +// TEST_F(JoinTest, InnerJoinNonAlignedCommon) +// { +// CVector cols0, cols1; +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); +// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); +// cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); +// cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1"}).release()); +// cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); - result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL); - result_sort_order = cudf::sorted_order(result->view()); - sorted_result = cudf::gather(result->view(), *result_sort_order); +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - col_gold_0 = {{ 3, -1, 2}, - { 1, 0, 1}}; - col_gold_1 = strcol_wrapper({ "s0", "s1", "s2"}, - { 1, 1, 1}); - col_gold_2 = {{ 0, 1, 2}, - { 1, 1, 1}}; - col_gold_3 = {{ 2, -1, -1}, - { 1, 0, 0}}; +// auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// CVector cols_gold; +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); +// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); +// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// 
CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } - // clang-format on - CVector cols_gold_nulls_unequal; - cols_gold_nulls_unequal.push_back(col_gold_0.release()); - cols_gold_nulls_unequal.push_back(col_gold_1.release()); - cols_gold_nulls_unequal.push_back(col_gold_2.release()); - cols_gold_nulls_unequal.push_back(col_gold_3.release()); - Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)}; +// TEST_F(JoinTest, InnerJoinNonAlignedCommonSwap) +// { +// CVector cols0, cols1; +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); +// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); +// cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 5}}.release()); +// cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0"}).release()); +// cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1, 0}}.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view()); - sorted_gold = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order); +// auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// CVector cols_gold; +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); +// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); +// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// 
CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} +// TEST_F(JoinTest, InnerJoinWithNulls) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 2}}; +// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); +// column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols0.push_back(col0_2.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); +// cols1.push_back(col1_2.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); -TEST_F(JoinTest, InnerJoinSizeOverflow) -{ - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - zero->set_valid(true); - static_cast *>(zero.get())->set_value(0); +// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// column_wrapper col_gold_0{{3, 2}}; +// strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); +// column_wrapper col_gold_2{{0, 1}}; +// column_wrapper col_gold_3{{1, -1}, {1, 0}}; +// CVector cols_gold; +// cols_gold.push_back(col_gold_0.release()); +// cols_gold.push_back(col_gold_1.release()); +// cols_gold.push_back(col_gold_2.release()); +// cols_gold.push_back(col_gold_3.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } - // Should cause size overflow, raise exception - int32_t left = 4; - int32_t right = 1073741825; +// // Test to check 
join behaviour when join keys are null. +// TEST_F(JoinTest, InnerJoinOnNulls) +// { +// // clang-format off +// column_wrapper col0_0{{ 3, 1, 2, 0, 2}}; +// strcol_wrapper col0_1({"s1", "s1", "s8", "s4", "s0"}, +// { 1, 1, 0, 1, 1}); +// column_wrapper col0_2{{ 0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{ 2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, +// { 1, 0, 1, 1, 1}); +// column_wrapper col1_2{{ 1, 0, 1, 2, 1}}; + +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols0.push_back(col0_2.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); +// cols1.push_back(col1_2.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - auto col0_0 = cudf::make_column_from_scalar(*zero, left); - auto col1_0 = cudf::make_column_from_scalar(*zero, right); +// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// column_wrapper col_gold_0 {{ 3, 2}}; +// strcol_wrapper col_gold_1 ({"s1", "s0"}, +// { 1, 0}); +// column_wrapper col_gold_2{{ 0, 2}}; +// column_wrapper col_gold_3{{ 1, 0}}; +// CVector cols_gold; +// cols_gold.push_back(col_gold_0.release()); +// cols_gold.push_back(col_gold_1.release()); +// cols_gold.push_back(col_gold_2.release()); +// cols_gold.push_back(col_gold_3.release()); +// Table gold(std::move(cols_gold)); + +// auto gold_sort_order = cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + +// // Repeat test with compare_nulls_equal=false, +// // as per SQL standard. 
+ +// result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, +// cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); +// sorted_result = cudf::gather(result->view(), *result_sort_order); + +// col_gold_0 = {{ 3}}; +// col_gold_1 = strcol_wrapper({"s1"}, +// { 1}); +// col_gold_2 = {{ 0}}; +// col_gold_3 = {{ 1}}; + +// // clang-format on + +// CVector cols_gold_sql; +// cols_gold_sql.push_back(col_gold_0.release()); +// cols_gold_sql.push_back(col_gold_1.release()); +// cols_gold_sql.push_back(col_gold_2.release()); +// cols_gold_sql.push_back(col_gold_3.release()); +// Table gold_sql(std::move(cols_gold_sql)); + +// gold_sort_order = cudf::sorted_order(gold_sql.view()); +// sorted_gold = cudf::gather(gold_sql.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } - CVector cols0, cols1; - cols0.push_back(std::move(col0_0)); - cols1.push_back(std::move(col1_0)); +// // Empty Left Table +// TEST_F(JoinTest, EmptyLeftTableInnerJoin) +// { +// column_wrapper col0_0; +// column_wrapper col0_1; - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}), cudf::logic_error); -} +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); -TEST_F(JoinTest, InnerJoinNoNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; +// Table empty0(std::move(cols0)); +// Table t1(std::move(cols1)); - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; +// auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); 
+// CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); +// } - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); +// TEST_F(JoinTest, EmptyLeftTableLeftJoin) +// { +// column_wrapper col0_0; +// column_wrapper col0_1; - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - column_wrapper col_gold_0{{3, 2, 2}}; - strcol_wrapper col_gold_1({"s1", "s0", "s0"}); - column_wrapper col_gold_2{{0, 2, 1}}; - column_wrapper col_gold_3{{1, 0, 0}}; - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - cols_gold.push_back(col_gold_2.release()); - cols_gold.push_back(col_gold_3.release()); - Table gold(std::move(cols_gold)); +// Table empty0(std::move(cols0)); +// Table t1(std::move(cols1)); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} +// auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); +// } -TEST_F(JoinTest, InnerJoinNonAlignedCommon) -{ - CVector cols0, cols1; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - 
cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); - cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1"}).release()); - cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); +// TEST_F(JoinTest, EmptyLeftTableLeftJoinNonAlignedCommon) +// { +// column_wrapper col0_0; - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); +// CVector cols0, cols1; +// cols0.emplace_back(col0_0.release()); +// cols1.emplace_back(col1_0.release()); +// cols1.emplace_back(col1_1.release()); - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - Table gold(std::move(cols_gold)); +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} +// column_wrapper col_gold_0; +// column_wrapper col_gold_1; -TEST_F(JoinTest, InnerJoinNonAlignedCommonSwap) -{ - CVector cols0, cols1; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 
1, 2, 4, 1}}.release()); - cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 5}}.release()); - cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0"}).release()); - cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1, 0}}.release()); +// CVector cols_gold; +// cols_gold.emplace_back(col_gold_0.release()); +// cols_gold.emplace_back(col_gold_1.release()); - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// Table gold(std::move(cols_gold)); - auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); +// auto result = cudf::left_join(t0, t1, {0}, {1}, {{0, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); +// } - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - Table gold(std::move(cols_gold)); +// TEST_F(JoinTest, EmptyLeftTableFullJoin) +// { +// column_wrapper col0_0; +// column_wrapper col0_1; - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; -TEST_F(JoinTest, InnerJoinWithNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - column_wrapper col1_0{{2, 2, 0, 4, 
3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; +// Table empty0(std::move(cols0)); +// Table t1(std::move(cols1)); - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); +// auto result = cudf::full_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(t1, *result); +// } - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// // Empty Right Table +// TEST_F(JoinTest, EmptyRightTableInnerJoin) +// { +// column_wrapper col0_0{{2, 2, 0, 4, 3}}; +// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); +// column_wrapper col1_0; +// column_wrapper col1_1; - column_wrapper col_gold_0{{3, 2}}; - strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); - column_wrapper col_gold_2{{0, 1}}; - column_wrapper col_gold_3{{1, -1}, {1, 0}}; - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - cols_gold.push_back(col_gold_2.release()); - cols_gold.push_back(col_gold_3.release()); - Table gold(std::move(cols_gold)); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} +// Table t0(std::move(cols0)); +// Table empty1(std::move(cols1)); -// Test to check join behaviour when join keys are null. 
-TEST_F(JoinTest, InnerJoinOnNulls) -{ - // clang-format off - column_wrapper col0_0{{ 3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s8", "s4", "s0"}, - { 1, 1, 0, 1, 1}); - column_wrapper col0_2{{ 0, 1, 2, 4, 1}}; +// auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); +// } - column_wrapper col1_0{{ 2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, - { 1, 0, 1, 1, 1}); - column_wrapper col1_2{{ 1, 0, 1, 2, 1}}; +// TEST_F(JoinTest, EmptyRightTableInnerJoinNonAlignedCommon) +// { +// column_wrapper col0_0{{2, 2, 0, 4, 3}}; +// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); +// column_wrapper col1_0; - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// CVector cols0, cols1; +// cols0.emplace_back(col0_0.release()); +// cols0.emplace_back(col0_1.release()); +// cols1.emplace_back(col1_0.release()); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - column_wrapper col_gold_0 {{ 3, 2}}; - strcol_wrapper col_gold_1 ({"s1", "s0"}, - { 1, 0}); - column_wrapper col_gold_2{{ 0, 2}}; - column_wrapper col_gold_3{{ 1, 0}}; - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - cols_gold.push_back(col_gold_2.release()); - cols_gold.push_back(col_gold_3.release()); - Table gold(std::move(cols_gold)); +// column_wrapper col_gold_0; +// column_wrapper col_gold_1; - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto 
sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - - // Repeat test with compare_nulls_equal=false, - // as per SQL standard. - - result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL); - result_sort_order = cudf::sorted_order(result->view()); - sorted_result = cudf::gather(result->view(), *result_sort_order); - - col_gold_0 = {{ 3}}; - col_gold_1 = strcol_wrapper({"s1"}, - { 1}); - col_gold_2 = {{ 0}}; - col_gold_3 = {{ 1}}; - - // clang-format on - - CVector cols_gold_sql; - cols_gold_sql.push_back(col_gold_0.release()); - cols_gold_sql.push_back(col_gold_1.release()); - cols_gold_sql.push_back(col_gold_2.release()); - cols_gold_sql.push_back(col_gold_3.release()); - Table gold_sql(std::move(cols_gold_sql)); - - gold_sort_order = cudf::sorted_order(gold_sql.view()); - sorted_gold = cudf::gather(gold_sql.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} - -// Empty Left Table -TEST_F(JoinTest, EmptyLeftTableInnerJoin) -{ - column_wrapper col0_0; - column_wrapper col0_1; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - - Table empty0(std::move(cols0)); - Table t1(std::move(cols1)); - - auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); -} +// CVector cols_gold; +// cols_gold.emplace_back(col_gold_0.release()); +// cols_gold.emplace_back(col_gold_1.release()); -TEST_F(JoinTest, EmptyLeftTableLeftJoin) -{ - column_wrapper col0_0; - column_wrapper col0_1; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - CVector cols0, cols1; - 
cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - - Table empty0(std::move(cols0)); - Table t1(std::move(cols1)); - - auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); -} - -TEST_F(JoinTest, EmptyLeftTableLeftJoinNonAlignedCommon) -{ - column_wrapper col0_0; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; +// Table gold(std::move(cols_gold)); - CVector cols0, cols1; - cols0.emplace_back(col0_0.release()); - cols1.emplace_back(col1_0.release()); - cols1.emplace_back(col1_1.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - column_wrapper col_gold_0; - column_wrapper col_gold_1; - - CVector cols_gold; - cols_gold.emplace_back(col_gold_0.release()); - cols_gold.emplace_back(col_gold_1.release()); - - Table gold(std::move(cols_gold)); - - auto result = cudf::left_join(t0, t1, {0}, {1}, {{0, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -} - -TEST_F(JoinTest, EmptyLeftTableFullJoin) -{ - column_wrapper col0_0; - column_wrapper col0_1; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - - Table empty0(std::move(cols0)); - Table t1(std::move(cols1)); - - auto result = cudf::full_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(t1, *result); -} - -// Empty Right Table -TEST_F(JoinTest, EmptyRightTableInnerJoin) -{ - column_wrapper col0_0{{2, 2, 0, 4, 3}}; - column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - column_wrapper col1_0; - column_wrapper col1_1; - - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - 
cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - - Table t0(std::move(cols0)); - Table empty1(std::move(cols1)); - - auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -} - -TEST_F(JoinTest, EmptyRightTableInnerJoinNonAlignedCommon) -{ - column_wrapper col0_0{{2, 2, 0, 4, 3}}; - column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - column_wrapper col1_0; - - CVector cols0, cols1; - cols0.emplace_back(col0_0.release()); - cols0.emplace_back(col0_1.release()); - cols1.emplace_back(col1_0.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - column_wrapper col_gold_0; - column_wrapper col_gold_1; - - CVector cols_gold; - cols_gold.emplace_back(col_gold_0.release()); - cols_gold.emplace_back(col_gold_1.release()); - - Table gold(std::move(cols_gold)); - - auto result = cudf::inner_join(t0, t1, {1}, {0}, {{1, 0}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -} - -TEST_F(JoinTest, EmptyRightTableLeftJoin) -{ - column_wrapper col0_0{{2, 2, 0, 4, 3}, {1, 1, 1, 1, 1}}; - column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - column_wrapper col1_0; - column_wrapper col1_1; - - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - - Table t0(std::move(cols0)); - Table empty1(std::move(cols1)); - - auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); -} - -TEST_F(JoinTest, EmptyRightTableFullJoin) -{ - column_wrapper col0_0{{2, 2, 0, 4, 3}}; - column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; +// auto result = cudf::inner_join(t0, t1, {1}, {0}, {{1, 0}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); +// } - column_wrapper col1_0; - column_wrapper col1_1; +// TEST_F(JoinTest, EmptyRightTableLeftJoin) +// 
{ +// column_wrapper col0_0{{2, 2, 0, 4, 3}, {1, 1, 1, 1, 1}}; +// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); +// column_wrapper col1_0; +// column_wrapper col1_1; - Table t0(std::move(cols0)); - Table empty1(std::move(cols1)); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); -} +// Table t0(std::move(cols0)); +// Table empty1(std::move(cols1)); -// Both tables empty -TEST_F(JoinTest, BothEmptyInnerJoin) -{ - column_wrapper col0_0; - column_wrapper col0_1; +// auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); +// } - column_wrapper col1_0; - column_wrapper col1_1; +// TEST_F(JoinTest, EmptyRightTableFullJoin) +// { +// column_wrapper col0_0{{2, 2, 0, 4, 3}}; +// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); +// column_wrapper col1_0; +// column_wrapper col1_1; - Table t0(std::move(cols0)); - Table empty1(std::move(cols1)); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -} +// Table t0(std::move(cols0)); +// Table empty1(std::move(cols1)); -TEST_F(JoinTest, BothEmptyLeftJoin) -{ - column_wrapper col0_0; - 
column_wrapper col0_1; +// auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); +// } - column_wrapper col1_0; - column_wrapper col1_1; +// // Both tables empty +// TEST_F(JoinTest, BothEmptyInnerJoin) +// { +// column_wrapper col0_0; +// column_wrapper col0_1; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); +// column_wrapper col1_0; +// column_wrapper col1_1; - Table t0(std::move(cols0)); - Table empty1(std::move(cols1)); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -} +// Table t0(std::move(cols0)); +// Table empty1(std::move(cols1)); -TEST_F(JoinTest, BothEmptyFullJoin) -{ - column_wrapper col0_0; - column_wrapper col0_1; +// auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); +// } - column_wrapper col1_0; - column_wrapper col1_1; +// TEST_F(JoinTest, BothEmptyLeftJoin) +// { +// column_wrapper col0_0; +// column_wrapper col0_1; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); +// column_wrapper col1_0; +// column_wrapper col1_1; - Table t0(std::move(cols0)); - Table empty1(std::move(cols1)); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -} +// Table 
t0(std::move(cols0)); +// Table empty1(std::move(cols1)); -// EqualValues X Inner,Left,Full +// auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); +// } -TEST_F(JoinTest, EqualValuesInnerJoin) -{ - column_wrapper col0_0{{0, 0}}; - strcol_wrapper col0_1({"s0", "s0"}); +// TEST_F(JoinTest, BothEmptyFullJoin) +// { +// column_wrapper col0_0; +// column_wrapper col0_1; - column_wrapper col1_0{{0, 0}}; - strcol_wrapper col1_1({"s0", "s0"}); +// column_wrapper col1_0; +// column_wrapper col1_1; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// Table t0(std::move(cols0)); +// Table empty1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); +// } - column_wrapper col_gold_0{{0, 0, 0, 0}}; - strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - Table gold(std::move(cols_gold)); +// // EqualValues X Inner,Left,Full - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -} +// TEST_F(JoinTest, EqualValuesInnerJoin) +// { +// column_wrapper col0_0{{0, 0}}; +// strcol_wrapper col0_1({"s0", "s0"}); -TEST_F(JoinTest, EqualValuesLeftJoin) -{ - column_wrapper col0_0{{0, 0}}; - strcol_wrapper col0_1({"s0", "s0"}); +// column_wrapper col1_0{{0, 0}}; +// strcol_wrapper col1_1({"s0", "s0"}); - column_wrapper col1_0{{0, 0}}; - strcol_wrapper col1_1({"s0", "s0"}); +// 
CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// column_wrapper col_gold_0{{0, 0, 0, 0}}; +// strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); +// CVector cols_gold; +// cols_gold.push_back(col_gold_0.release()); +// cols_gold.push_back(col_gold_1.release()); +// Table gold(std::move(cols_gold)); - column_wrapper col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}}; - strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1}); - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - Table gold(std::move(cols_gold)); +// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); +// } - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -} +// TEST_F(JoinTest, EqualValuesLeftJoin) +// { +// column_wrapper col0_0{{0, 0}}; +// strcol_wrapper col0_1({"s0", "s0"}); -TEST_F(JoinTest, EqualValuesFullJoin) -{ - column_wrapper col0_0{{0, 0}}; - strcol_wrapper col0_1({"s0", "s0"}); +// column_wrapper col1_0{{0, 0}}; +// strcol_wrapper col1_1({"s0", "s0"}); - column_wrapper col1_0{{0, 0}}; - strcol_wrapper col1_1({"s0", "s0"}); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - 
cols1.push_back(col1_1.release()); +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// column_wrapper col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}}; +// strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1}); +// CVector cols_gold; +// cols_gold.push_back(col_gold_0.release()); +// cols_gold.push_back(col_gold_1.release()); +// Table gold(std::move(cols_gold)); - column_wrapper col_gold_0{{0, 0, 0, 0}}; - strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - Table gold(std::move(cols_gold)); +// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); +// } - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -} +// TEST_F(JoinTest, EqualValuesFullJoin) +// { +// column_wrapper col0_0{{0, 0}}; +// strcol_wrapper col0_1({"s0", "s0"}); -TEST_F(JoinTest, InnerJoinCornerCase) -{ - column_wrapper col0_0{{4, 1, 3, 2, 2, 2, 2}}; - column_wrapper col1_0{{2}}; +// column_wrapper col1_0{{0, 0}}; +// strcol_wrapper col1_1({"s0", "s0"}); - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols1.push_back(col1_0.release()); +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); +// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - column_wrapper col_gold_0{{2, 2, 2, 2}}; - CVector 
cols_gold; - cols_gold.push_back(col_gold_0.release()); - Table gold(std::move(cols_gold)); +// column_wrapper col_gold_0{{0, 0, 0, 0}}; +// strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); +// CVector cols_gold; +// cols_gold.push_back(col_gold_0.release()); +// cols_gold.push_back(col_gold_1.release()); +// Table gold(std::move(cols_gold)); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} +// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); +// } -// TEST_F(JoinTest, HashJoinSequentialProbes) +// TEST_F(JoinTest, InnerJoinCornerCase) // { -// CVector cols1; -// cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); -// cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release()); -// cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); +// column_wrapper col0_0{{4, 1, 3, 2, 2, 2, 2}}; +// column_wrapper col1_0{{2}}; +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols1.push_back(col1_0.release()); + +// Table t0(std::move(cols0)); // Table t1(std::move(cols1)); -// cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL); +// auto result = cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}); +// auto result_sort_order = cudf::sorted_order(result->view()); +// auto sorted_result = cudf::gather(result->view(), *result_sort_order); -// { -// CVector cols0; -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); -// cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); -// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// Table t0(std::move(cols0)); - -// auto result = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// CVector cols_gold; -// 
cols_gold.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release()); -// cols_gold.emplace_back( -// strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release()); -// cols_gold.emplace_back( -// column_wrapper{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}} -// .release()); -// cols_gold.emplace_back( -// column_wrapper{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}} -// .release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// } +// column_wrapper col_gold_0{{2, 2, 2, 2}}; +// CVector cols_gold; +// cols_gold.push_back(col_gold_0.release()); +// Table gold(std::move(cols_gold)); -// { -// CVector cols0; -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); -// cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); -// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// Table t0(std::move(cols0)); - -// auto result = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// CVector cols_gold; -// cols_gold.emplace_back(column_wrapper{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}.release()); -// cols_gold.emplace_back( -// strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release()); -// cols_gold.emplace_back(column_wrapper{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}.release()); -// cols_gold.emplace_back(column_wrapper{{1, -1, -1, -1, -1}, {1, 0, 0, 0, -// 0}}.release()); Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// } +// auto gold_sort_order = 
cudf::sorted_order(gold.view()); +// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// } +// // TEST_F(JoinTest, HashJoinSequentialProbes) +// // { +// // CVector cols1; +// // cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); +// // cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release()); +// // cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); + +// // Table t1(std::move(cols1)); + +// // cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL); + +// // { +// // CVector cols0; +// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); +// // cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); +// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// // Table t0(std::move(cols0)); + +// // auto result = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}}); +// // auto result_sort_order = cudf::sorted_order(result->view()); +// // auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// // CVector cols_gold; +// // cols_gold.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release()); +// // cols_gold.emplace_back( +// // strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release()); +// // cols_gold.emplace_back( +// // column_wrapper{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}} +// // .release()); +// // cols_gold.emplace_back( +// // column_wrapper{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}} +// // .release()); +// // Table gold(std::move(cols_gold)); + +// // auto gold_sort_order = cudf::sorted_order(gold.view()); +// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// // } + +// // { +// // CVector cols0; +// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); +// // 
cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); +// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// // Table t0(std::move(cols0)); + +// // auto result = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}}); +// // auto result_sort_order = cudf::sorted_order(result->view()); +// // auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// // CVector cols_gold; +// // cols_gold.emplace_back(column_wrapper{{3, 3, 1, 2, 0}, {1, 1, 1, 1, +// 1}}.release()); +// // cols_gold.emplace_back( +// // strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release()); +// // cols_gold.emplace_back(column_wrapper{{1, 0, 1, 2, 4}, {1, 1, 1, 1, +// 1}}.release()); +// // cols_gold.emplace_back(column_wrapper{{1, -1, -1, -1, -1}, {1, 0, 0, 0, +// // 0}}.release()); Table gold(std::move(cols_gold)); + +// // auto gold_sort_order = cudf::sorted_order(gold.view()); +// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// // } + +// // { +// // CVector cols0; +// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// // cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); +// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// // Table t0(std::move(cols0)); + +// // auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}}); +// // auto joined_cols = probe_build_pair.first->release(); +// // auto build_cols = probe_build_pair.second->release(); +// // joined_cols.insert(joined_cols.end(), +// // std::make_move_iterator(build_cols.begin()), +// // std::make_move_iterator(build_cols.end())); +// // auto result = std::make_unique(std::move(joined_cols)); +// // auto result_sort_order = cudf::sorted_order(result->view()); +// // auto sorted_result = cudf::gather(result->view(), 
*result_sort_order); + +// // CVector cols_gold; +// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// // cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); +// // cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); +// // cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); +// // Table gold(std::move(cols_gold)); + +// // auto gold_sort_order = cudf::sorted_order(gold.view()); +// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// // } + +// // { +// // CVector cols0; +// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); +// // cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); +// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); + +// // Table t0(std::move(cols0)); + +// // auto probe_build_pair = hash_join.inner_join( +// // t0, {1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD); +// // auto joined_cols = probe_build_pair.second->release(); +// // auto probe_cols = probe_build_pair.first->release(); +// // joined_cols.insert(joined_cols.end(), +// // std::make_move_iterator(probe_cols.begin()), +// // std::make_move_iterator(probe_cols.end())); +// // auto result = std::make_unique(std::move(joined_cols)); +// // auto result_sort_order = cudf::sorted_order(result->view()); +// // auto sorted_result = cudf::gather(result->view(), *result_sort_order); + +// // CVector cols_gold; +// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// // cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); +// // cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); +// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); +// // cols_gold.emplace_back(column_wrapper{{0, 
2, 1}}.release()); +// // Table gold(std::move(cols_gold)); + +// // auto gold_sort_order = cudf::sorted_order(gold.view()); +// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); +// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// // } +// // } + +// struct JoinDictionaryTest : public cudf::test::BaseFixture { +// }; + +// TEST_F(JoinDictionaryTest, LeftJoinNoNulls) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 3}}; +// strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); +// auto col0_1 = cudf::dictionary::encode(col0_1_w); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; +// auto col1_1 = cudf::dictionary::encode(col1_1_w); +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); +// auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); +// auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); +// auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); // { -// CVector cols0; -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); -// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// Table t0(std::move(cols0)); - -// auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}}); -// auto joined_cols = probe_build_pair.first->release(); -// auto build_cols = probe_build_pair.second->release(); -// joined_cols.insert(joined_cols.end(), -// std::make_move_iterator(build_cols.begin()), -// std::make_move_iterator(build_cols.end())); -// auto result = std::make_unique(std::move(joined_cols)); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// CVector cols_gold; -// 
cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); -// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); -// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// auto result = +// cudf::left_join(t0, t1, {0}, {0}, std::vector>{}); +// auto result_view = result->view(); +// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); +// auto decoded4 = cudf::dictionary::decode(result_view.column(4)); +// std::vector result_decoded({result_view.column(0), +// decoded1->view(), +// result_view.column(2), +// result_view.column(3), +// decoded4->view(), +// result_view.column(5)}); + +// auto gold = +// cudf::left_join(g0, g1, {0}, {0}, std::vector>{}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); // } - // { -// CVector cols0; -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); -// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// Table t0(std::move(cols0)); - -// auto probe_build_pair = hash_join.inner_join( -// t0, {1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD); -// auto joined_cols = probe_build_pair.second->release(); -// auto probe_cols = probe_build_pair.first->release(); -// joined_cols.insert(joined_cols.end(), -// std::make_move_iterator(probe_cols.begin()), -// std::make_move_iterator(probe_cols.end())); -// auto result = std::make_unique(std::move(joined_cols)); -// auto result_sort_order = 
cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// CVector cols_gold; -// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); -// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); -// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); +// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_view = result->view(); +// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); +// std::vector result_decoded( +// {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); + +// auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); // } // } -struct JoinDictionaryTest : public cudf::test::BaseFixture { -}; - -TEST_F(JoinDictionaryTest, LeftJoinNoNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 3}}; - strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); - auto col0_1 = cudf::dictionary::encode(col0_1_w); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; - auto col1_1 = cudf::dictionary::encode(col1_1_w); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; - - auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); - auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); - auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - { - auto result = - cudf::left_join(t0, t1, {0}, {0}, 
std::vector>{}); - auto result_view = result->view(); - auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - auto decoded4 = cudf::dictionary::decode(result_view.column(4)); - std::vector result_decoded({result_view.column(0), - decoded1->view(), - result_view.column(2), - result_view.column(3), - decoded4->view(), - result_view.column(5)}); - - auto gold = - cudf::left_join(g0, g1, {0}, {0}, std::vector>{}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); - } - { - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_view = result->view(); - auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - std::vector result_decoded( - {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); - - auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); - } -} - -TEST_F(JoinDictionaryTest, LeftJoinWithNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); - column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; - auto col0_2 = cudf::dictionary::encode(col0_2_w); - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - auto col1_2 = cudf::dictionary::encode(col1_2_w); - - auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); - auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_view = result->view(); - auto decoded2 = cudf::dictionary::decode(result_view.column(2)); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()}); - - auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); 
- auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -} - -TEST_F(JoinDictionaryTest, InnerJoinNoNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1_w({"s1", "s1", "s0", "s4", "s0"}); - auto col0_1 = cudf::dictionary::encode(col0_1_w); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1_w({"s1", "s0", "s1", "s2", "s1"}); - auto col1_1 = cudf::dictionary::encode(col1_1_w); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; - - auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); - auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_view = result->view(); - auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - std::vector result_decoded( - {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); - - auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); - auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); -} - -TEST_F(JoinDictionaryTest, InnerJoinWithNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); - column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; - auto col0_2 = cudf::dictionary::encode(col0_2_w); - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - auto col1_2 = cudf::dictionary::encode(col1_2_w); - - auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); - auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - - auto result = 
cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_view = result->view(); - auto decoded2 = cudf::dictionary::decode(result_view.column(2)); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()}); - - auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); - auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); -} - -TEST_F(JoinDictionaryTest, FullJoinNoNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 3}}; - strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); - auto col0_1 = cudf::dictionary::encode(col0_1_w); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; - auto col1_1 = cudf::dictionary::encode(col1_1_w); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; - - auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); - auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_view = result->view(); - auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - std::vector result_decoded( - {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); - - auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); - auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -} - -TEST_F(JoinDictionaryTest, FullJoinWithNulls) -{ - column_wrapper col0_0_w{{3, 1, 2, 0, 3}}; - auto col0_0 = cudf::dictionary::encode(col0_0_w); - strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); - column_wrapper col0_2{{0, 
1, 2, 4, 1}}; - - column_wrapper col1_0_w{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; - auto col1_0 = cudf::dictionary::encode(col1_0_w); - strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; - column_wrapper col1_2{{1, 0, 1, 2, 1}}; - - auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2}); - auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2}); - - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_view = result->view(); - auto decoded0 = cudf::dictionary::decode(result_view.column(0)); - std::vector result_decoded( - {decoded0->view(), result_view.column(1), result_view.column(2), result_view.column(3)}); - - auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); - auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -} - -TEST_F(JoinTest, InnerJoinGathermap) -{ - column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; - - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); - auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); - auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); - auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), *lmap_sort_order); - auto rmap_sorted = cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); - - 
column_wrapper lmap_gold{{0, 2, 4}}; - column_wrapper rmap_gold{{1, 1, 4}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); -} +// TEST_F(JoinDictionaryTest, LeftJoinWithNulls) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 2}}; +// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); +// column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; +// auto col0_2 = cudf::dictionary::encode(col0_2_w); + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); +// column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; +// auto col1_2 = cudf::dictionary::encode(col1_2_w); + +// auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); +// auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); + +// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_view = result->view(); +// auto decoded2 = cudf::dictionary::decode(result_view.column(2)); +// auto decoded3 = cudf::dictionary::decode(result_view.column(3)); +// std::vector result_decoded( +// {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()}); + +// auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); +// auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); +// auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); +// } -TEST_F(JoinTest, LeftJoinGathermap) -{ - column_wrapper col0_0{{3, 1, 2, 0, 3}}; - strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; +// TEST_F(JoinDictionaryTest, InnerJoinNoNulls) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 2}}; +// strcol_wrapper col0_1_w({"s1", "s1", "s0", "s4", "s0"}); +// auto col0_1 = cudf::dictionary::encode(col0_1_w); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 
3}}; +// strcol_wrapper col1_1_w({"s1", "s0", "s1", "s2", "s1"}); +// auto col1_1 = cudf::dictionary::encode(col1_1_w); +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); +// auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); + +// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_view = result->view(); +// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); +// std::vector result_decoded( +// {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); + +// auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); +// auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); +// auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); +// } - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; +// TEST_F(JoinDictionaryTest, InnerJoinWithNulls) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 2}}; +// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); +// column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; +// auto col0_2 = cudf::dictionary::encode(col0_2_w); + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); +// column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; +// auto col1_2 = cudf::dictionary::encode(col1_2_w); + +// auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); +// auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); + +// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_view = result->view(); +// auto decoded2 = cudf::dictionary::decode(result_view.column(2)); +// auto decoded3 = cudf::dictionary::decode(result_view.column(3)); +// std::vector result_decoded( +// {result_view.column(0), 
result_view.column(1), decoded2->view(), decoded3->view()}); + +// auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); +// auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); +// auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); +// } - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); +// TEST_F(JoinDictionaryTest, FullJoinNoNulls) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 3}}; +// strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); +// auto col0_1 = cudf::dictionary::encode(col0_1_w); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; +// auto col1_1 = cudf::dictionary::encode(col1_1_w); +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); +// auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); + +// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_view = result->view(); +// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); +// std::vector result_decoded( +// {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); + +// auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); +// auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); +// auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); +// } - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// TEST_F(JoinDictionaryTest, FullJoinWithNulls) +// { +// column_wrapper col0_0_w{{3, 1, 2, 0, 3}}; +// auto col0_0 = cudf::dictionary::encode(col0_0_w); +// 
strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0_w{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; +// auto col1_0 = cudf::dictionary::encode(col1_0_w); +// strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2}); +// auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2}); + +// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// auto result_view = result->view(); +// auto decoded0 = cudf::dictionary::decode(result_view.column(0)); +// std::vector result_decoded( +// {decoded0->view(), result_view.column(1), result_view.column(2), result_view.column(3)}); + +// auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); +// auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); +// auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); +// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); +// } - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); - auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); - auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); - auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), *lmap_sort_order); - auto rmap_sorted = cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); +// TEST_F(JoinTest, InnerJoinGathermap) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 2}}; +// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols0.push_back(col0_2.release()); +// cols1.push_back(col1_0.release()); 
+// cols1.push_back(col1_1.release()); +// cols1.push_back(col1_2.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - column_wrapper lmap_gold{{0, 1, 2, 3, 4}}; - column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; +// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); +// auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); +// auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); +// auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), +// *lmap_sort_order); auto rmap_sorted = +// cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); + +// column_wrapper lmap_gold{{0, 2, 4}}; +// column_wrapper rmap_gold{{1, 1, 4}}; +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); +// } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); -} +// TEST_F(JoinTest, LeftJoinGathermap) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 3}}; +// strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 3}}; +// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols0.push_back(col0_2.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); +// cols1.push_back(col1_2.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); -TEST_F(JoinTest, FullJoinGatherMap) -{ - column_wrapper col0_0{{3, 1, 2, 0, 3}}; - strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; +// auto result = cudf::left_join(t0, t1, 
{0, 1}, {0, 1}); +// auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); +// auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); +// auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), +// *lmap_sort_order); auto rmap_sorted = +// cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); - column_wrapper col1_0{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; - strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; - column_wrapper col1_2{{1, 0, 1, 2, 1}}; +// column_wrapper lmap_gold{{0, 1, 2, 3, 4}}; +// column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); +// } - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); +// TEST_F(JoinTest, FullJoinGatherMap) +// { +// column_wrapper col0_0{{3, 1, 2, 0, 3}}; +// strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); +// column_wrapper col0_2{{0, 1, 2, 4, 1}}; + +// column_wrapper col1_0{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; +// strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; +// column_wrapper col1_2{{1, 0, 1, 2, 1}}; + +// CVector cols0, cols1; +// cols0.push_back(col0_0.release()); +// cols0.push_back(col0_1.release()); +// cols0.push_back(col0_2.release()); +// cols1.push_back(col1_0.release()); +// cols1.push_back(col1_1.release()); +// cols1.push_back(col1_2.release()); + +// Table t0(std::move(cols0)); +// Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); - auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); - auto 
rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); - auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), *lmap_sort_order); - auto rmap_sorted = cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); +// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); +// auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); +// auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); +// auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), +// *lmap_sort_order); auto rmap_sorted = +// cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); - column_wrapper lmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; - column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; +// column_wrapper lmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; +// column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); -} +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); +// CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); +// } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/join/semi_join_tests.cpp b/cpp/tests/join/semi_join_tests.cpp index 30ac1b57e55..8de9610b07d 100644 --- a/cpp/tests/join/semi_join_tests.cpp +++ b/cpp/tests/join/semi_join_tests.cpp @@ -34,830 +34,3 @@ using column_wrapper = cudf::test::fixed_width_column_wrapper; struct JoinTest : public cudf::test::BaseFixture { }; - -TEST_F(JoinTest, LeftSemiJoin) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"quick", "composéd", "result", ""}; - 
- column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{10, 20, 20, 20}; - column_wrapper expect_1{5.0, .7, .7, .7}; - column_wrapper expect_2{90, 61, 62, 63}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftSemiJoin_with_a_string_key) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"quick", "result"}; 
- - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{10, 20}; - column_wrapper expect_1{5.0, .7}; - column_wrapper expect_2{90, 62}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftSemiJoin_with_null) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{"quick", "result"}; - - 
column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{10, 20}; - column_wrapper expect_1{5.0, .7}; - column_wrapper expect_2{90, 62}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"accénted", "turtlé", "words"}; - - 
column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{20, 20, 50}; - column_wrapper expect_1{.5, .5, .7}; - column_wrapper expect_2{77, 78, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_with_a_string_key) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"accénted", "turtlé", "composéd", 
"", "words"}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{20, 20, 20, 20, 50}; - column_wrapper expect_1{.5, .5, .7, .7, .7}; - column_wrapper expect_2{77, 78, 61, 63, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_with_null) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", nullptr}; - 
std::vector e_strings{"accénted", "turtlé", "composéd", "", "words"}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{20, 20, 20, 20, 50}; - column_wrapper expect_1{.5, .5, .7, .7, .7}; - column_wrapper expect_2{77, 78, 61, 63, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftSemiAntiJoin_exceptions) -{ - std::vector b_strings{"quick", "words", "result", nullptr}; - - column_wrapper b_0{10, 20, 
20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - // - // table_a has no columns, table_b has columns - // Let's check different permutations of passing table - // with no columns to verify that exceptions are thrown - // - EXPECT_THROW(cudf::left_semi_join(table_a, table_b, {}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_a, table_b, {}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_semi_join(table_b, table_a, {}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_b, table_a, {}, {}, {}), cudf::logic_error); - - // - // table_b has columns, so we'll pass the column checks, but - // these should fail the exception check that the number of - // join columns must be the same for each table - // - EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {0}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {0}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {}, {0}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {}, {0}, {}), cudf::logic_error); -} - -TEST_F(JoinTest, LeftSemiJoin_empty_result) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 
77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{}; - column_wrapper expect_1{}; - column_wrapper expect_2{}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = - cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, std::vector{}); - - EXPECT_EQ(join_table->num_columns(), 0); - EXPECT_EQ(join_table->num_rows(), 0); - - auto join_table2 = cudf::left_semi_join(table_a, table_b, {}, {}, {0, 1, 3}); - - EXPECT_EQ(join_table2->num_columns(), 3); - EXPECT_EQ(join_table2->num_rows(), 0); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_empty_result) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", 
nullptr}; - std::vector e_strings{}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{}; - column_wrapper expect_1{}; - column_wrapper expect_2{}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = - cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, std::vector{}); - - EXPECT_EQ(join_table->num_columns(), 0); - EXPECT_EQ(join_table->num_rows(), 0); - - auto join_table2 = cudf::left_anti_join(table_a, table_b, {}, {}, {0, 1, 3}); - - EXPECT_EQ(join_table2->num_columns(), 3); - EXPECT_EQ(join_table2->num_rows(), 0); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3); -} - -TEST_F(JoinTest, 
LeftSemiAntiJoin_empty_table) -{ - std::vector a_strings{}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{}; - - column_wrapper a_0{}; - column_wrapper a_1{}; - column_wrapper a_2{}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{}; - column_wrapper expect_1{}; - column_wrapper expect_2{}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table->get_column(3), expect_3); - - auto join_table2 = cudf::left_semi_join(table_b, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(3), expect_3); - - auto join_table3 = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table3->get_column(3), expect_3); - - auto join_table4 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table4->get_column(3), expect_3); - - auto join_table5 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table5->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_empty_right_table) -{ - std::vector a_strings{"quick", "words", "result", nullptr}; - std::vector b_strings{}; - std::vector e_strings{"quick", "words", "result", nullptr}; - - column_wrapper a_0{10, 20, 20, 50}; - column_wrapper a_1{5.0, .7, .7, .7}; - column_wrapper a_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{}; - column_wrapper b_1{}; - column_wrapper b_2{}; - - 
cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{10, 20, 20, 50}; - column_wrapper expect_1{5.0, .7, .7, .7}; - column_wrapper expect_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -struct JoinDictionaryTest : public cudf::test::BaseFixture { -}; - -TEST_F(JoinDictionaryTest, LeftSemiJoin) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = 
cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - { - auto result = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected); - } - { - auto result = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); - } -} - -TEST_F(JoinDictionaryTest, LeftSemiJoinWithNulls) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - - auto 
result = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); -} - -TEST_F(JoinDictionaryTest, LeftAntiJoin) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - { - auto result = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected); - } - { - auto result = cudf::left_anti_join(table_a, table_b, 
{0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); - } -} - -TEST_F(JoinDictionaryTest, LeftAntiJoinWithNulls) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - - auto result = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); -} - -TEST_F(JoinTest, LeftSemiSimple) -{ - column_wrapper a_0{1, 9, 0}; - column_wrapper a_1{1, 2, 3}; - auto table_a = cudf::table_view({a_0, a_1}); - - column_wrapper 
b_0{0, 1}; - column_wrapper b_1{1, 2}; - auto table_b = cudf::table_view({b_0, b_1}); - - auto result = cudf::left_anti_join(table_a, table_b, {0}, {0}, {0, 1}); - auto result_view = result->view(); - - column_wrapper expect_0{9}; - column_wrapper expect_1{2}; - auto expect = cudf::table_view({expect_0, expect_1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expect); -} From d736d1c9298e97fd38086c879d4e60e5473a6365 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 18 Feb 2021 08:30:23 -0500 Subject: [PATCH 043/138] More join tests --- cpp/tests/join/join_tests.cpp | 179 ++++++++++++++++++---------------- 1 file changed, 95 insertions(+), 84 deletions(-) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index ec15e5b03c7..b8af44d2083 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -371,7 +371,7 @@ TEST_F(JoinTest, LeftJoinWithNulls) column_wrapper col0_2{{0, 1, 2, 4, 1}}; column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, ); + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; CVector cols0, cols1; @@ -392,9 +392,9 @@ TEST_F(JoinTest, LeftJoinWithNulls) column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {1, 1, 1, 1, 1}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1}); column_wrapper col_gold_2{{0, 1, 2, 4, 1}, {1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{3, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; - strcol_wrapper col_gold_4{{"s1", "", "", "", ""}, {1, 0, 0, 0, 0}}; - column_wrapper col_gold_5{{1, 1, -1, 1, 1}, {1, 0, 1, 1, 1}}; + column_wrapper col_gold_3{{3, -1, -1, -1, 2}, {1, 0, 0, 0, 1}}; + strcol_wrapper col_gold_4{{"s1", "", "", "", "s0"}, {1, 0, 0, 0, 1}}; + column_wrapper col_gold_5{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); @@ -407,103 +407,114 @@ TEST_F(JoinTest, LeftJoinWithNulls) auto gold_sort_order = 
cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } -// TEST_F(JoinTest, LeftJoinOnNulls) -// { -// // clang-format off -// column_wrapper col0_0{{ 3, 1, 2}, -// { 1, 0, 1}}; -// strcol_wrapper col0_1({"s0", "s1", "s2" }); -n// column_wrapper col0_2{{ 0, 1, 2 }}; +TEST_F(JoinTest, LeftJoinOnNulls) +{ + // clang-format off + column_wrapper col0_0{{ 3, 1, 2}, + { 1, 0, 1}}; + strcol_wrapper col0_1({"s0", "s1", "s2" }); + column_wrapper col0_2{{ 0, 1, 2 }}; -// column_wrapper col1_0{{ 2, 5, 3, 7 }, -// { 1, 1, 1, 0 }}; -// strcol_wrapper col1_1({"s1", "s0", "s0", "s1" }); -// column_wrapper col1_2{{ 1, 4, 2, 8 }}; + column_wrapper col1_0{{ 2, 5, 3, 7 }, + { 1, 1, 1, 0 }}; + strcol_wrapper col1_1({"s1", "s0", "s0", "s1" }); + column_wrapper col1_2{{ 1, 4, 2, 8 }}; -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols0.push_back(col0_2.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); -// cols1.push_back(col1_2.release()); + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); -// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); -// #if 0 
-// std::cout << "Actual Results:\n"; -// cudf::test::print(sorted_result->get_column(0).view(), std::cout, ",\t\t"); -// cudf::test::print(sorted_result->get_column(1).view(), std::cout, ",\t\t"); -// cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t"); -// cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t"); -// #endif - -// column_wrapper col_gold_0{{ 3, -1, 2}, -// { 1, 0, 1}}; -// strcol_wrapper col_gold_1({ "s0", "s1", "s2"}, -// { 1, 1, 1}); -// column_wrapper col_gold_2{{ 0, 1, 2}, -// { 1, 1, 1}}; -// column_wrapper col_gold_3{{ 2, 8, -1}, -// { 1, 1, 0}}; +#if 0 + std::cout << "Actual Results:\n"; + cudf::test::print(sorted_result->get_column(0).view(), std::cout, ",\t\t"); + cudf::test::print(sorted_result->get_column(1).view(), std::cout, ",\t\t"); + cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t"); + cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t"); +#endif -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// cols_gold.push_back(col_gold_1.release()); -// cols_gold.push_back(col_gold_2.release()); -// cols_gold.push_back(col_gold_3.release()); -// Table gold(std::move(cols_gold)); + column_wrapper col_gold_0{{ 3, -1, 2}, + { 1, 0, 1}}; + strcol_wrapper col_gold_1({ "s0", "s1", "s2"}, + { 1, 1, 1}); + column_wrapper col_gold_2{{ 0, 1, 2}, + { 1, 1, 1}}; + column_wrapper col_gold_3{{ 3, -1, -1}, + { 1, 0, 0}}; + strcol_wrapper col_gold_4({ "s0", "s1", ""}, + { 1, 1, 0}); + column_wrapper col_gold_5{{ 2, 8, -1}, + { 1, 1, 0}}; + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = 
cudf::gather(gold.view(), *gold_sort_order); + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// #if 0 -// std::cout << "Expected Results:\n"; -// cudf::test::print(sorted_gold->get_column(0).view(), std::cout, ",\t\t"); -// cudf::test::print(sorted_gold->get_column(1).view(), std::cout, ",\t\t"); -// cudf::test::print(sorted_gold->get_column(2).view(), std::cout, ",\t\t"); -// cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t"); -// #endif +#if 0 + std::cout << "Expected Results:\n"; + cudf::test::print(sorted_gold->get_column(0).view(), std::cout, ",\t\t"); + cudf::test::print(sorted_gold->get_column(1).view(), std::cout, ",\t\t"); + cudf::test::print(sorted_gold->get_column(2).view(), std::cout, ",\t\t"); + cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t"); +#endif -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); -// // Repeat test with compare_nulls_equal=false, -// // as per SQL standard. + // Repeat test with compare_nulls_equal=false, + // as per SQL standard. 
-// result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, -// cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); -// sorted_result = cudf::gather(result->view(), *result_sort_order); + result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); + result_sort_order = cudf::sorted_order(result->view()); + sorted_result = cudf::gather(result->view(), *result_sort_order); -// col_gold_0 = {{ 3, -1, 2}, -// { 1, 0, 1}}; -// col_gold_1 = strcol_wrapper({ "s0", "s1", "s2"}, -// { 1, 1, 1}); -// col_gold_2 = {{ 0, 1, 2}, -// { 1, 1, 1}}; -// col_gold_3 = {{ 2, -1, -1}, -// { 1, 0, 0}}; + + col_gold_0 = {{ 3, -1, 2}, + { 1, 0, 1}}; + col_gold_1 = {{ "s0", "s1", "s2"}, + { 1, 1, 1}}; + col_gold_2 = {{ 0, 1, 2}, + { 1, 1, 1}}; + col_gold_3 = {{ 3, -1, -1}, + { 1, 0, 0}}; + col_gold_4 = {{ "s0", "", ""}, + { 1, 0, 0}}; + col_gold_5 = {{ 2, -1, -1}, + { 1, 0, 0}}; -// // clang-format on -// CVector cols_gold_nulls_unequal; -// cols_gold_nulls_unequal.push_back(col_gold_0.release()); -// cols_gold_nulls_unequal.push_back(col_gold_1.release()); -// cols_gold_nulls_unequal.push_back(col_gold_2.release()); -// cols_gold_nulls_unequal.push_back(col_gold_3.release()); -// Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)}; + // clang-format on + CVector cols_gold_nulls_unequal; + cols_gold_nulls_unequal.push_back(col_gold_0.release()); + cols_gold_nulls_unequal.push_back(col_gold_1.release()); + cols_gold_nulls_unequal.push_back(col_gold_2.release()); + cols_gold_nulls_unequal.push_back(col_gold_3.release()); + Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)}; -// gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view()); -// sorted_gold = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order); + gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view()); + sorted_gold = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, 
*sorted_result); -// } + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} // TEST_F(JoinTest, InnerJoinSizeOverflow) // { From b58591dce203c752d62441e3d75a3fa42ddc6e00 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 18 Feb 2021 12:24:15 -0500 Subject: [PATCH 044/138] Fix all join tests --- cpp/tests/join/join_tests.cpp | 1774 +++++++++++++++------------------ 1 file changed, 786 insertions(+), 988 deletions(-) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index b8af44d2083..fbde179d33d 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -110,7 +110,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinNoNulls) @@ -159,7 +159,7 @@ TEST_F(JoinTest, FullJoinNoNulls) auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinWithNulls) @@ -208,7 +208,7 @@ TEST_F(JoinTest, FullJoinWithNulls) auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinOnNulls) @@ -516,1030 +516,828 @@ TEST_F(JoinTest, LeftJoinOnNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } -// TEST_F(JoinTest, InnerJoinSizeOverflow) -// { -// auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); -// zero->set_valid(true); -// static_cast *>(zero.get())->set_value(0); 
- -// // Should cause size overflow, raise exception -// int32_t left = 4; -// int32_t right = 1073741825; - -// auto col0_0 = cudf::make_column_from_scalar(*zero, left); -// auto col1_0 = cudf::make_column_from_scalar(*zero, right); - -// CVector cols0, cols1; -// cols0.push_back(std::move(col0_0)); -// cols1.push_back(std::move(col1_0)); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}), cudf::logic_error); -// } - -// TEST_F(JoinTest, InnerJoinNoNulls) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 2}}; -// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols0.push_back(col0_2.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); -// cols1.push_back(col1_2.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// column_wrapper col_gold_0{{3, 2, 2}}; -// strcol_wrapper col_gold_1({"s1", "s0", "s0"}); -// column_wrapper col_gold_2{{0, 2, 1}}; -// column_wrapper col_gold_3{{1, 0, 0}}; -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// cols_gold.push_back(col_gold_1.release()); -// cols_gold.push_back(col_gold_2.release()); -// cols_gold.push_back(col_gold_3.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, 
*sorted_result); -// } - -// TEST_F(JoinTest, InnerJoinNonAlignedCommon) -// { -// CVector cols0, cols1; -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); -// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); -// cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); -// cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1"}).release()); -// cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// CVector cols_gold; -// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); -// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); -// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// } - -// TEST_F(JoinTest, InnerJoinNonAlignedCommonSwap) -// { -// CVector cols0, cols1; -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); -// cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); -// cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 5}}.release()); -// cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1", 
"s0"}).release()); -// cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1, 0}}.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// CVector cols_gold; -// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); -// cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); -// cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// } - -// TEST_F(JoinTest, InnerJoinWithNulls) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 2}}; -// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); -// column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols0.push_back(col0_2.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); -// cols1.push_back(col1_2.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// column_wrapper col_gold_0{{3, 2}}; -// strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); -// 
column_wrapper col_gold_2{{0, 1}}; -// column_wrapper col_gold_3{{1, -1}, {1, 0}}; -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// cols_gold.push_back(col_gold_1.release()); -// cols_gold.push_back(col_gold_2.release()); -// cols_gold.push_back(col_gold_3.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// } +TEST_F(JoinTest, InnerJoinSizeOverflow) +{ + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); + zero->set_valid(true); + static_cast *>(zero.get())->set_value(0); + + // Should cause size overflow, raise exception + int32_t left = 4; + int32_t right = 1073741825; + + auto col0_0 = cudf::make_column_from_scalar(*zero, left); + auto col1_0 = cudf::make_column_from_scalar(*zero, right); + + CVector cols0, cols1; + cols0.push_back(std::move(col0_0)); + cols1.push_back(std::move(col1_0)); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}), cudf::logic_error); +} + +TEST_F(JoinTest, InnerJoinNoNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), 
*result_sort_order); + + column_wrapper col_gold_0{{3, 2, 2}}; + strcol_wrapper col_gold_1({"s1", "s0", "s0"}); + column_wrapper col_gold_2{{0, 2, 1}}; + column_wrapper col_gold_3{{3, 2, 2}}; + strcol_wrapper col_gold_4({"s1", "s0", "s0"}); + column_wrapper col_gold_5{{1, 0, 0}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + +TEST_F(JoinTest, InnerJoinWithNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{3, 2}}; + strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); + column_wrapper col_gold_2{{0, 1}}; + column_wrapper col_gold_3{{3, 2}}; + strcol_wrapper col_gold_4({"s1", "s0"}, {1, 1}); + column_wrapper col_gold_5{{1, -1}, {1, 0}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); 
+ cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} // // Test to check join behaviour when join keys are null. -// TEST_F(JoinTest, InnerJoinOnNulls) -// { -// // clang-format off -// column_wrapper col0_0{{ 3, 1, 2, 0, 2}}; -// strcol_wrapper col0_1({"s1", "s1", "s8", "s4", "s0"}, -// { 1, 1, 0, 1, 1}); -// column_wrapper col0_2{{ 0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{ 2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, -// { 1, 0, 1, 1, 1}); -// column_wrapper col1_2{{ 1, 0, 1, 2, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols0.push_back(col0_2.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); -// cols1.push_back(col1_2.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// column_wrapper col_gold_0 {{ 3, 2}}; -// strcol_wrapper col_gold_1 ({"s1", "s0"}, -// { 1, 0}); -// column_wrapper col_gold_2{{ 0, 2}}; -// column_wrapper col_gold_3{{ 1, 0}}; -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// cols_gold.push_back(col_gold_1.release()); -// cols_gold.push_back(col_gold_2.release()); -// cols_gold.push_back(col_gold_3.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = 
cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - -// // Repeat test with compare_nulls_equal=false, -// // as per SQL standard. - -// result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, -// cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); -// sorted_result = cudf::gather(result->view(), *result_sort_order); - -// col_gold_0 = {{ 3}}; -// col_gold_1 = strcol_wrapper({"s1"}, -// { 1}); -// col_gold_2 = {{ 0}}; -// col_gold_3 = {{ 1}}; - -// // clang-format on - -// CVector cols_gold_sql; -// cols_gold_sql.push_back(col_gold_0.release()); -// cols_gold_sql.push_back(col_gold_1.release()); -// cols_gold_sql.push_back(col_gold_2.release()); -// cols_gold_sql.push_back(col_gold_3.release()); -// Table gold_sql(std::move(cols_gold_sql)); - -// gold_sort_order = cudf::sorted_order(gold_sql.view()); -// sorted_gold = cudf::gather(gold_sql.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// } - -// // Empty Left Table -// TEST_F(JoinTest, EmptyLeftTableInnerJoin) -// { -// column_wrapper col0_0; -// column_wrapper col0_1; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); - -// Table empty0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); -// } - -// TEST_F(JoinTest, EmptyLeftTableLeftJoin) -// { -// column_wrapper col0_0; -// column_wrapper col0_1; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// 
cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); - -// Table empty0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); -// } - -// TEST_F(JoinTest, EmptyLeftTableLeftJoinNonAlignedCommon) -// { -// column_wrapper col0_0; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// CVector cols0, cols1; -// cols0.emplace_back(col0_0.release()); -// cols1.emplace_back(col1_0.release()); -// cols1.emplace_back(col1_1.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// column_wrapper col_gold_0; -// column_wrapper col_gold_1; - -// CVector cols_gold; -// cols_gold.emplace_back(col_gold_0.release()); -// cols_gold.emplace_back(col_gold_1.release()); - -// Table gold(std::move(cols_gold)); - -// auto result = cudf::left_join(t0, t1, {0}, {1}, {{0, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -// } +TEST_F(JoinTest, InnerJoinOnNulls) +{ + // clang-format off + column_wrapper col0_0{{ 3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s8", "s4", "s0"}, + { 1, 1, 0, 1, 1}); + column_wrapper col0_2{{ 0, 1, 2, 4, 1}}; -// TEST_F(JoinTest, EmptyLeftTableFullJoin) -// { -// column_wrapper col0_0; -// column_wrapper col0_1; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); + column_wrapper col1_0{{ 2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, + { 1, 0, 1, 1, 1}); + column_wrapper col1_2{{ 1, 0, 1, 2, 1}}; -// Table empty0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::full_join(empty0, t1, {0, 
1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(t1, *result); -// } - -// // Empty Right Table -// TEST_F(JoinTest, EmptyRightTableInnerJoin) -// { -// column_wrapper col0_0{{2, 2, 0, 4, 3}}; -// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// column_wrapper col1_0; -// column_wrapper col1_1; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); -// Table t0(std::move(cols0)); -// Table empty1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -// } + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); -// TEST_F(JoinTest, EmptyRightTableInnerJoinNonAlignedCommon) -// { -// column_wrapper col0_0{{2, 2, 0, 4, 3}}; -// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// column_wrapper col1_0; + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); -// CVector cols0, cols1; -// cols0.emplace_back(col0_0.release()); -// cols0.emplace_back(col0_1.release()); -// cols1.emplace_back(col1_0.release()); + column_wrapper col_gold_0 {{ 3, 2}}; + strcol_wrapper col_gold_1 ({"s1", "s0"}, + { 1, 0}); + column_wrapper col_gold_2{{ 0, 2}}; + column_wrapper col_gold_3 {{ 3, 2}}; + strcol_wrapper col_gold_4 ({"s1", "s0"}, + { 1, 0}); + column_wrapper col_gold_5{{ 1, 0}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + 
cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + + Table gold(std::move(cols_gold)); -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); -// column_wrapper col_gold_0; -// column_wrapper col_gold_1; + // Repeat test with compare_nulls_equal=false, + // as per SQL standard. -// CVector cols_gold; -// cols_gold.emplace_back(col_gold_0.release()); -// cols_gold.emplace_back(col_gold_1.release()); + result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); + result_sort_order = cudf::sorted_order(result->view()); + sorted_result = cudf::gather(result->view(), *result_sort_order); -// Table gold(std::move(cols_gold)); + col_gold_0 = {{ 3}}; + col_gold_1 = strcol_wrapper({"s1"}, + { 1}); + col_gold_2 = {{ 0}}; + col_gold_3 = {{ 3}}; + col_gold_4 = strcol_wrapper({"s1"}, + { 1}); + col_gold_5 = {{ 1}}; -// auto result = cudf::inner_join(t0, t1, {1}, {0}, {{1, 0}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -// } + // clang-format on -// TEST_F(JoinTest, EmptyRightTableLeftJoin) -// { -// column_wrapper col0_0{{2, 2, 0, 4, 3}, {1, 1, 1, 1, 1}}; -// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + CVector cols_gold_sql; + cols_gold_sql.push_back(col_gold_0.release()); + cols_gold_sql.push_back(col_gold_1.release()); + cols_gold_sql.push_back(col_gold_2.release()); + cols_gold_sql.push_back(col_gold_3.release()); + cols_gold_sql.push_back(col_gold_4.release()); + cols_gold_sql.push_back(col_gold_5.release()); + Table gold_sql(std::move(cols_gold_sql)); + + gold_sort_order = cudf::sorted_order(gold_sql.view()); + sorted_gold = cudf::gather(gold_sql.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} -// 
column_wrapper col1_0; -// column_wrapper col1_1; +// Empty Left Table +TEST_F(JoinTest, EmptyLeftTableInnerJoin) +{ + column_wrapper col0_0; + column_wrapper col0_1; -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; -// Table t0(std::move(cols0)); -// Table empty1(std::move(cols1)); + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); -// auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); -// } + Table empty0(std::move(cols0)); + Table t1(std::move(cols1)); -// TEST_F(JoinTest, EmptyRightTableFullJoin) -// { -// column_wrapper col0_0{{2, 2, 0, 4, 3}}; -// column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - -// column_wrapper col1_0; -// column_wrapper col1_1; + auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result); +} -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); - -// Table t0(std::move(cols0)); -// Table empty1(std::move(cols1)); - -// auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); -// } +TEST_F(JoinTest, EmptyLeftTableLeftJoin) +{ + column_wrapper col0_0; + column_wrapper col0_1; -// // Both tables empty -// TEST_F(JoinTest, BothEmptyInnerJoin) -// { -// column_wrapper col0_0; -// column_wrapper col0_1; + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; -// column_wrapper col1_0; -// column_wrapper col1_1; - -// 
CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); -// Table t0(std::move(cols0)); -// Table empty1(std::move(cols1)); + Table empty0(std::move(cols0)); + Table t1(std::move(cols1)); -// auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -// } + auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result); +} -// TEST_F(JoinTest, BothEmptyLeftJoin) -// { -// column_wrapper col0_0; -// column_wrapper col0_1; +TEST_F(JoinTest, EmptyLeftTableFullJoin) +{ + column_wrapper col0_0; + column_wrapper col0_1; -// column_wrapper col1_0; -// column_wrapper col1_1; + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); -// Table t0(std::move(cols0)); -// Table empty1(std::move(cols1)); + Table lhs(std::move(cols0)); + Table rhs(std::move(cols1)); -// auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -// } + auto result = cudf::full_join(lhs, rhs, {0, 1}, {0, 1}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); -// TEST_F(JoinTest, BothEmptyFullJoin) -// { -// column_wrapper col0_0; -// 
column_wrapper col0_1; + column_wrapper col_gold_0{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}}; + column_wrapper col_gold_1{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}}; + column_wrapper col_gold_2{{2, 2, 0, 4, 3}}; + column_wrapper col_gold_3{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; -// column_wrapper col1_0; -// column_wrapper col1_1; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); - -// Table t0(std::move(cols0)); -// Table empty1(std::move(cols1)); - -// auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -// } + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + +// Empty Right Table +TEST_F(JoinTest, EmptyRightTableInnerJoin) +{ + column_wrapper col0_0{{2, 2, 0, 4, 3}}; + column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + + column_wrapper col1_0; + column_wrapper col1_1; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table empty1(std::move(cols1)); + + auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); +} + +TEST_F(JoinTest, EmptyRightTableLeftJoin) +{ + column_wrapper col0_0{{2, 2, 0, 4, 3}, {1, 1, 1, 1, 1}}; + column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + + column_wrapper col1_0; + column_wrapper col1_1; + + CVector cols0, cols1; + 
cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table empty1(std::move(cols1)); + + auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result); +} + +TEST_F(JoinTest, EmptyRightTableFullJoin) +{ + column_wrapper col0_0{{2, 2, 0, 4, 3}}; + column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + + column_wrapper col1_0; + column_wrapper col1_1; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table empty1(std::move(cols1)); + + auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result); +} + +// Both tables empty +TEST_F(JoinTest, BothEmptyInnerJoin) +{ + column_wrapper col0_0; + column_wrapper col0_1; + + column_wrapper col1_0; + column_wrapper col1_1; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table empty1(std::move(cols1)); + + auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); +} + +TEST_F(JoinTest, BothEmptyLeftJoin) +{ + column_wrapper col0_0; + column_wrapper col0_1; + + column_wrapper col1_0; + column_wrapper col1_1; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table empty1(std::move(cols1)); + + auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); +} + +TEST_F(JoinTest, BothEmptyFullJoin) +{ + column_wrapper col0_0; + 
column_wrapper col0_1; + + column_wrapper col1_0; + column_wrapper col1_1; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table empty1(std::move(cols1)); + + auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); +} // // EqualValues X Inner,Left,Full -// TEST_F(JoinTest, EqualValuesInnerJoin) -// { -// column_wrapper col0_0{{0, 0}}; -// strcol_wrapper col0_1({"s0", "s0"}); +TEST_F(JoinTest, EqualValuesInnerJoin) +{ + column_wrapper col0_0{{0, 0}}; + strcol_wrapper col0_1({"s0", "s0"}); + + column_wrapper col1_0{{0, 0}}; + strcol_wrapper col1_1({"s0", "s0"}); + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + + column_wrapper col_gold_0{{0, 0, 0, 0}}; + strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); + column_wrapper col_gold_2{{0, 0, 0, 0}}; + strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}); + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + + Table gold(std::move(cols_gold)); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result); +} + +TEST_F(JoinTest, EqualValuesLeftJoin) +{ + column_wrapper col0_0{{0, 0}}; + strcol_wrapper col0_1({"s0", "s0"}); + + column_wrapper col1_0{{0, 0}}; + strcol_wrapper col1_1({"s0", "s0"}); + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table 
t1(std::move(cols1)); + + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + + column_wrapper col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}}; + strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1}); + column_wrapper col_gold_2{{0, 0, 0, 0}, {1, 1, 1, 1}}; + strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1}); + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result); +} + +TEST_F(JoinTest, EqualValuesFullJoin) +{ + column_wrapper col0_0{{0, 0}}; + strcol_wrapper col0_1({"s0", "s0"}); + + column_wrapper col1_0{{0, 0}}; + strcol_wrapper col1_1({"s0", "s0"}); + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + + column_wrapper col_gold_0{{0, 0, 0, 0}}; + strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); + column_wrapper col_gold_2{{0, 0, 0, 0}}; + strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}); + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result); +} + +TEST_F(JoinTest, InnerJoinCornerCase) +{ + column_wrapper col0_0{{4, 1, 3, 2, 2, 2, 2}}; + column_wrapper col1_0{{2}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols1.push_back(col1_0.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::inner_join(t0, t1, {0}, {0}); + auto result_sort_order = 
cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{2, 2, 2, 2}}; + column_wrapper col_gold_1{{2, 2, 2, 2}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + +TEST_F(JoinTest, HashJoinSequentialProbes) +{ + CVector cols1; + cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); + cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release()); + + Table t1(std::move(cols1)); + + cudf::hash_join hash_join(t1, cudf::null_equality::EQUAL); + + { + CVector cols0; + cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); + cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); + + Table t0(std::move(cols0)); + + auto result = hash_join.full_join(t0); + auto result_table = cudf::table_view({result.first->view(), result.second->view()}); + auto result_sort_order = cudf::sorted_order(result_table); + auto sorted_result = cudf::gather(result_table, *result_sort_order); + + column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}}; + column_wrapper col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}}; + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + + Table gold(std::move(cols_gold)); + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); + } -// column_wrapper col1_0{{0, 0}}; -// strcol_wrapper col1_1({"s0", "s0"}); + { + CVector cols0; + cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 
3}}.release()); + cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); + Table t0(std::move(cols0)); -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - -// column_wrapper col_gold_0{{0, 0, 0, 0}}; -// strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// cols_gold.push_back(col_gold_1.release()); -// Table gold(std::move(cols_gold)); + auto result = hash_join.left_join(t0); + auto result_table = cudf::table_view({result.first->view(), result.second->view()}); + auto result_sort_order = cudf::sorted_order(result_table); + auto sorted_result = cudf::gather(result_table, *result_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -// } - -// TEST_F(JoinTest, EqualValuesLeftJoin) -// { -// column_wrapper col0_0{{0, 0}}; -// strcol_wrapper col0_1({"s0", "s0"}); + column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; + column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; -// column_wrapper col1_0{{0, 0}}; -// strcol_wrapper col1_1({"s0", "s0"}); + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + + Table gold(std::move(cols_gold)); + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); + } + + { + CVector cols0; + cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); + cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); + + Table t0(std::move(cols0)); + + auto result = hash_join.inner_join(t0); + auto result_table = 
cudf::table_view({result.first->view(), result.second->view()}); + auto result_sort_order = cudf::sorted_order(result_table); + auto sorted_result = cudf::gather(result_table, *result_sort_order); + + column_wrapper col_gold_0{{2, 4, 0}}; + column_wrapper col_gold_1{{1, 1, 4}}; + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + + Table gold(std::move(cols_gold)); + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); + } +} + +struct JoinDictionaryTest : public cudf::test::BaseFixture { +}; -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - -// column_wrapper col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}}; -// strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1}); -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// cols_gold.push_back(col_gold_1.release()); -// Table gold(std::move(cols_gold)); - -// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -// } - -// TEST_F(JoinTest, EqualValuesFullJoin) -// { -// column_wrapper col0_0{{0, 0}}; -// strcol_wrapper col0_1({"s0", "s0"}); - -// column_wrapper col1_0{{0, 0}}; -// strcol_wrapper col1_1({"s0", "s0"}); - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - -// column_wrapper col_gold_0{{0, 0, 0, 0}}; -// strcol_wrapper col_gold_1({"s0", "s0", "s0", 
"s0"}); -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// cols_gold.push_back(col_gold_1.release()); -// Table gold(std::move(cols_gold)); - -// CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); -// } - -// TEST_F(JoinTest, InnerJoinCornerCase) -// { -// column_wrapper col0_0{{4, 1, 3, 2, 2, 2, 2}}; -// column_wrapper col1_0{{2}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols1.push_back(col1_0.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}); -// auto result_sort_order = cudf::sorted_order(result->view()); -// auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// column_wrapper col_gold_0{{2, 2, 2, 2}}; -// CVector cols_gold; -// cols_gold.push_back(col_gold_0.release()); -// Table gold(std::move(cols_gold)); - -// auto gold_sort_order = cudf::sorted_order(gold.view()); -// auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// } - -// // TEST_F(JoinTest, HashJoinSequentialProbes) -// // { -// // CVector cols1; -// // cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); -// // cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release()); -// // cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); - -// // Table t1(std::move(cols1)); - -// // cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL); - -// // { -// // CVector cols0; -// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); -// // cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); -// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// // Table t0(std::move(cols0)); - -// // auto result = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}}); -// // auto result_sort_order = cudf::sorted_order(result->view()); -// // auto sorted_result = 
cudf::gather(result->view(), *result_sort_order); - -// // CVector cols_gold; -// // cols_gold.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release()); -// // cols_gold.emplace_back( -// // strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release()); -// // cols_gold.emplace_back( -// // column_wrapper{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}} -// // .release()); -// // cols_gold.emplace_back( -// // column_wrapper{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}} -// // .release()); -// // Table gold(std::move(cols_gold)); - -// // auto gold_sort_order = cudf::sorted_order(gold.view()); -// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// // } - -// // { -// // CVector cols0; -// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); -// // cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); -// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// // Table t0(std::move(cols0)); - -// // auto result = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}}); -// // auto result_sort_order = cudf::sorted_order(result->view()); -// // auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// // CVector cols_gold; -// // cols_gold.emplace_back(column_wrapper{{3, 3, 1, 2, 0}, {1, 1, 1, 1, -// 1}}.release()); -// // cols_gold.emplace_back( -// // strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release()); -// // cols_gold.emplace_back(column_wrapper{{1, 0, 1, 2, 4}, {1, 1, 1, 1, -// 1}}.release()); -// // cols_gold.emplace_back(column_wrapper{{1, -1, -1, -1, -1}, {1, 0, 0, 0, -// // 0}}.release()); Table gold(std::move(cols_gold)); - -// // auto gold_sort_order = cudf::sorted_order(gold.view()); -// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); 
-// // } - -// // { -// // CVector cols0; -// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// // cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); -// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// // Table t0(std::move(cols0)); - -// // auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}}); -// // auto joined_cols = probe_build_pair.first->release(); -// // auto build_cols = probe_build_pair.second->release(); -// // joined_cols.insert(joined_cols.end(), -// // std::make_move_iterator(build_cols.begin()), -// // std::make_move_iterator(build_cols.end())); -// // auto result = std::make_unique(std::move(joined_cols)); -// // auto result_sort_order = cudf::sorted_order(result->view()); -// // auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// // CVector cols_gold; -// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// // cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); -// // cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); -// // cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); -// // Table gold(std::move(cols_gold)); - -// // auto gold_sort_order = cudf::sorted_order(gold.view()); -// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// // } - -// // { -// // CVector cols0; -// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// // cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); -// // cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); -// // cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - -// // Table t0(std::move(cols0)); - -// // auto probe_build_pair = hash_join.inner_join( -// // t0, 
{1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD); -// // auto joined_cols = probe_build_pair.second->release(); -// // auto probe_cols = probe_build_pair.first->release(); -// // joined_cols.insert(joined_cols.end(), -// // std::make_move_iterator(probe_cols.begin()), -// // std::make_move_iterator(probe_cols.end())); -// // auto result = std::make_unique(std::move(joined_cols)); -// // auto result_sort_order = cudf::sorted_order(result->view()); -// // auto sorted_result = cudf::gather(result->view(), *result_sort_order); - -// // CVector cols_gold; -// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// // cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); -// // cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); -// // cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); -// // cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); -// // Table gold(std::move(cols_gold)); - -// // auto gold_sort_order = cudf::sorted_order(gold.view()); -// // auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); -// // CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -// // } -// // } - -// struct JoinDictionaryTest : public cudf::test::BaseFixture { -// }; - -// TEST_F(JoinDictionaryTest, LeftJoinNoNulls) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 3}}; -// strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); -// auto col0_1 = cudf::dictionary::encode(col0_1_w); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; -// auto col1_1 = cudf::dictionary::encode(col1_1_w); -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); -// auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); -// auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); -// auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); -// { -// 
auto result = -// cudf::left_join(t0, t1, {0}, {0}, std::vector>{}); -// auto result_view = result->view(); -// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); -// auto decoded4 = cudf::dictionary::decode(result_view.column(4)); -// std::vector result_decoded({result_view.column(0), -// decoded1->view(), -// result_view.column(2), -// result_view.column(3), -// decoded4->view(), -// result_view.column(5)}); - -// auto gold = -// cudf::left_join(g0, g1, {0}, {0}, std::vector>{}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -// } -// { -// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_view = result->view(); -// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); -// std::vector result_decoded( -// {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); - -// auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -// } -// } - -// TEST_F(JoinDictionaryTest, LeftJoinWithNulls) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 2}}; -// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); -// column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; -// auto col0_2 = cudf::dictionary::encode(col0_2_w); - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); -// column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; -// auto col1_2 = cudf::dictionary::encode(col1_2_w); - -// auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); -// auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - -// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_view = result->view(); -// auto decoded2 = cudf::dictionary::decode(result_view.column(2)); -// auto decoded3 = cudf::dictionary::decode(result_view.column(3)); -// std::vector result_decoded( -// 
{result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()}); - -// auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); -// auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); -// auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -// } - -// TEST_F(JoinDictionaryTest, InnerJoinNoNulls) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 2}}; -// strcol_wrapper col0_1_w({"s1", "s1", "s0", "s4", "s0"}); -// auto col0_1 = cudf::dictionary::encode(col0_1_w); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1_w({"s1", "s0", "s1", "s2", "s1"}); -// auto col1_1 = cudf::dictionary::encode(col1_1_w); -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); -// auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - -// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_view = result->view(); -// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); -// std::vector result_decoded( -// {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); - -// auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); -// auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); -// auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); -// } - -// TEST_F(JoinDictionaryTest, InnerJoinWithNulls) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 2}}; -// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); -// column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; -// auto col0_2 = cudf::dictionary::encode(col0_2_w); - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); -// column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 
1, 1, 1}}; -// auto col1_2 = cudf::dictionary::encode(col1_2_w); - -// auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); -// auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - -// auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_view = result->view(); -// auto decoded2 = cudf::dictionary::decode(result_view.column(2)); -// auto decoded3 = cudf::dictionary::decode(result_view.column(3)); -// std::vector result_decoded( -// {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()}); - -// auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); -// auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); -// auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); -// } - -// TEST_F(JoinDictionaryTest, FullJoinNoNulls) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 3}}; -// strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); -// auto col0_1 = cudf::dictionary::encode(col0_1_w); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; -// auto col1_1 = cudf::dictionary::encode(col1_1_w); -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); -// auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - -// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_view = result->view(); -// auto decoded1 = cudf::dictionary::decode(result_view.column(1)); -// std::vector result_decoded( -// {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); - -// auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); -// auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); -// auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// 
CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -// } - -// TEST_F(JoinDictionaryTest, FullJoinWithNulls) -// { -// column_wrapper col0_0_w{{3, 1, 2, 0, 3}}; -// auto col0_0 = cudf::dictionary::encode(col0_0_w); -// strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0_w{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; -// auto col1_0 = cudf::dictionary::encode(col1_0_w); -// strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2}); -// auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2}); - -// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// auto result_view = result->view(); -// auto decoded0 = cudf::dictionary::decode(result_view.column(0)); -// std::vector result_decoded( -// {decoded0->view(), result_view.column(1), result_view.column(2), result_view.column(3)}); - -// auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); -// auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); -// auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); -// CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); -// } - -// TEST_F(JoinTest, InnerJoinGathermap) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 2}}; -// strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols0.push_back(col0_2.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); -// cols1.push_back(col1_2.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = 
cudf::inner_join(t0, t1, {0, 1}, {0, 1}); -// auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); -// auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); -// auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), -// *lmap_sort_order); auto rmap_sorted = -// cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); - -// column_wrapper lmap_gold{{0, 2, 4}}; -// column_wrapper rmap_gold{{1, 1, 4}}; -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); -// } - -// TEST_F(JoinTest, LeftJoinGathermap) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 3}}; -// strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}}; -// strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols0.push_back(col0_2.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); -// cols1.push_back(col1_2.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); -// auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); -// auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); -// auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), -// *lmap_sort_order); auto rmap_sorted = -// cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); - -// column_wrapper lmap_gold{{0, 1, 2, 3, 4}}; -// column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; - -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); -// 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); -// } - -// TEST_F(JoinTest, FullJoinGatherMap) -// { -// column_wrapper col0_0{{3, 1, 2, 0, 3}}; -// strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); -// column_wrapper col0_2{{0, 1, 2, 4, 1}}; - -// column_wrapper col1_0{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; -// strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; -// column_wrapper col1_2{{1, 0, 1, 2, 1}}; - -// CVector cols0, cols1; -// cols0.push_back(col0_0.release()); -// cols0.push_back(col0_1.release()); -// cols0.push_back(col0_2.release()); -// cols1.push_back(col1_0.release()); -// cols1.push_back(col1_1.release()); -// cols1.push_back(col1_2.release()); - -// Table t0(std::move(cols0)); -// Table t1(std::move(cols1)); - -// auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); -// auto lmap_sort_order = cudf::sorted_order(cudf::table_view({result.first->view()})); -// auto rmap_sort_order = cudf::sorted_order(cudf::table_view({result.second->view()})); -// auto lmap_sorted = cudf::gather(cudf::table_view({result.first->view()}), -// *lmap_sort_order); auto rmap_sorted = -// cudf::gather(cudf::table_view({result.second->view()}), *rmap_sort_order); - -// column_wrapper lmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; -// column_wrapper rmap_gold{{NoneValue, NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; - -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(lmap_sorted->view().column(0), lmap_gold); -// CUDF_TEST_EXPECT_COLUMNS_EQUAL(rmap_sorted->view().column(0), rmap_gold); -// } +TEST_F(JoinDictionaryTest, LeftJoinNoNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 3}}; + strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); + auto col0_1 = cudf::dictionary::encode(col0_1_w); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; + auto col1_1 = cudf::dictionary::encode(col1_1_w); + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + 
auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); + auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); + auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); + auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); + { + auto result = cudf::left_join(t0, t1, {0}, {0}); + auto result_view = result->view(); + auto decoded1 = cudf::dictionary::decode(result_view.column(1)); + auto decoded4 = cudf::dictionary::decode(result_view.column(4)); + std::vector result_decoded({result_view.column(0), + decoded1->view(), + result_view.column(2), + result_view.column(3), + decoded4->view(), + result_view.column(5)}); + + auto gold = cudf::left_join(g0, g1, {0}, {0}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + } +} + +TEST_F(JoinDictionaryTest, LeftJoinWithNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; + auto col0_2 = cudf::dictionary::encode(col0_2_w); + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + auto col1_2 = cudf::dictionary::encode(col1_2_w); + + auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); + auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); + + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto result_view = result->view(); + auto decoded2 = cudf::dictionary::decode(result_view.column(2)); + auto decoded5 = cudf::dictionary::decode(result_view.column(5)); + std::vector result_decoded({result_view.column(0), + result_view.column(1), + decoded2->view(), + result_view.column(3), + result_view.column(4), + decoded5->view()}); + + auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); + auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); + auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, 
cudf::table_view(result_decoded)); +} + +TEST_F(JoinDictionaryTest, InnerJoinNoNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1_w({"s1", "s1", "s0", "s4", "s0"}); + auto col0_1 = cudf::dictionary::encode(col0_1_w); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1_w({"s1", "s0", "s1", "s2", "s1"}); + auto col1_1 = cudf::dictionary::encode(col1_1_w); + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); + auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); + + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result_view = result->view(); + auto decoded1 = cudf::dictionary::decode(result_view.column(1)); + auto decoded4 = cudf::dictionary::decode(result_view.column(4)); + std::vector result_decoded({result_view.column(0), + decoded1->view(), + result_view.column(2), + result_view.column(3), + decoded4->view(), + result_view.column(5)}); + + auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); + auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); +} + +TEST_F(JoinDictionaryTest, InnerJoinWithNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col0_2_w{{0, 1, 2, 4, 1}}; + auto col0_2 = cudf::dictionary::encode(col0_2_w); + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + auto col1_2 = cudf::dictionary::encode(col1_2_w); + + auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); + auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); + + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result_view = result->view(); + auto decoded2 = 
cudf::dictionary::decode(result_view.column(2)); + auto decoded5 = cudf::dictionary::decode(result_view.column(5)); + std::vector result_decoded({result_view.column(0), + result_view.column(1), + decoded2->view(), + result_view.column(3), + result_view.column(4), + decoded5->view()}); + + auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); + auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); +} + +TEST_F(JoinDictionaryTest, FullJoinNoNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 3}}; + strcol_wrapper col0_1_w({"s0", "s1", "s2", "s4", "s1"}); + auto col0_1 = cudf::dictionary::encode(col0_1_w); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1_w{{"s1", "s0", "s1", "s2", "s1"}}; + auto col1_1 = cudf::dictionary::encode(col1_1_w); + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); + auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); + + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result_view = result->view(); + auto decoded1 = cudf::dictionary::decode(result_view.column(1)); + auto decoded4 = cudf::dictionary::decode(result_view.column(4)); + std::vector result_decoded({result_view.column(0), + decoded1->view(), + result_view.column(2), + result_view.column(3), + decoded4->view(), + result_view.column(5)}); + + auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); + auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); +} + +TEST_F(JoinDictionaryTest, FullJoinWithNulls) +{ + column_wrapper col0_0_w{{3, 1, 2, 0, 3}}; + auto col0_0 = cudf::dictionary::encode(col0_0_w); + strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); + column_wrapper 
col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0_w{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; + auto col1_0 = cudf::dictionary::encode(col1_0_w); + strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2}); + auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2}); + + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result_view = result->view(); + auto decoded0 = cudf::dictionary::decode(result_view.column(0)); + auto decoded3 = cudf::dictionary::decode(result_view.column(3)); + std::vector result_decoded({decoded0->view(), + result_view.column(1), + result_view.column(2), + decoded3->view(), + result_view.column(4), + result_view.column(5)}); + + auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); + auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); +} CUDF_TEST_PROGRAM_MAIN() From be560bbc3f58d50e3463b7fefae65e5e42b455c6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 18 Feb 2021 13:31:35 -0500 Subject: [PATCH 045/138] Python regressions --- cpp/benchmarks/join/join_benchmark.cu | 8 +-- python/cudf/cudf/_lib/cpp/join.pxd | 30 ++++-------- .../cudf/cudf/_lib/cpp/table/table_view.pxd | 1 + python/cudf/cudf/_lib/join.pyx | 49 +++++++++---------- 4 files changed, 36 insertions(+), 52 deletions(-) diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu index bd013afc451..fa6afdd908c 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ b/cpp/benchmarks/join/join_benchmark.cu @@ -105,12 +105,8 @@ static void BM_join(benchmark::State &state) for (auto _ : state) { cuda_event_timer raii(state, true, 0); - auto result = cudf::inner_join(probe_table, - build_table, - columns_to_join, - columns_to_join, - {{0, 0}}, - cudf::null_equality::UNEQUAL); + auto result = 
cudf::inner_join( + probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); } } diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd index 55180e2b74e..6ebde3934c3 100644 --- a/python/cudf/cudf/_lib/cpp/join.pxd +++ b/python/cudf/cudf/_lib/cpp/join.pxd @@ -14,36 +14,26 @@ from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/join.hpp" namespace "cudf" nogil: cdef pair[unique_ptr[column], unique_ptr[column]] inner_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on + const table_view left_keys, + const table_view right_keys, ) except + cdef pair[unique_ptr[column], unique_ptr[column]] left_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on + const table_view left_keys, + const table_view right_keys, ) except + cdef pair[unique_ptr[column], unique_ptr[column]] full_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on + const table_view left_keys, + const table_view right_keys, ) except + cdef unique_ptr[column] left_semi_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on, + const table_view left_keys, + const table_view right_keys, ) except + cdef unique_ptr[column] left_anti_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on, + const table_view left_keys, + const table_view right_keys, ) except + diff --git a/python/cudf/cudf/_lib/cpp/table/table_view.pxd b/python/cudf/cudf/_lib/cpp/table/table_view.pxd index 2f386d337cd..7bbfa69836c 100644 --- a/python/cudf/cudf/_lib/cpp/table/table_view.pxd +++ b/python/cudf/cudf/_lib/cpp/table/table_view.pxd @@ -15,6 +15,7 @@ cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil: column_view column(size_type 
column_index) except + size_type num_columns() except + size_type num_rows() except + + table_view select(vector[size_type] column_indices) except + cdef cppclass mutable_table_view: mutable_table_view() except + diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 41b59e3d2e7..f31e75f94a8 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -13,6 +13,7 @@ from cudf._lib.column cimport Column from cudf._lib.table cimport Table, columns_from_ptr from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cimport cudf._lib.cpp.join as cpp_join @@ -20,32 +21,32 @@ cimport cudf._lib.cpp.join as cpp_join cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): # left, inner and outer join - cdef vector[int] c_left_on = left_on - cdef vector[int] c_right_on = right_on + cdef vector[size_type] c_left_on = left_on + cdef vector[size_type] c_right_on = right_on cdef pair[unique_ptr[column], unique_ptr[column]] c_result - cdef table_view c_lhs = lhs.view() - cdef table_view c_rhs = rhs.view() + cdef table_view c_lhs = lhs.view().select(c_left_on) + cdef table_view c_rhs = rhs.view().select(c_right_on) if how == "inner": - c_result = move(cpp_join.inner_join( - c_lhs, - c_rhs, - c_left_on, - c_right_on, - )) + if c_lhs.num_rows() < c_rhs.num_rows(): + c_result = move(cpp_join.inner_join( + c_rhs, + c_lhs + )) + else: + c_result = move(cpp_join.inner_join( + c_lhs, + c_rhs + )) elif how == "left": c_result = move(cpp_join.left_join( c_lhs, - c_rhs, - c_left_on, - c_right_on, + c_rhs )) elif how == "outer": c_result = move(cpp_join.full_join( c_lhs, - c_rhs, - c_left_on, - c_right_on + c_rhs )) else: raise ValueError(f"Unkown join type {how}") @@ -57,25 +58,21 @@ cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): cpdef semi_join(Table lhs, Table rhs, left_on, right_on, 
how=None): # left-semi and left-anti joins - cdef vector[int] c_left_on = left_on - cdef vector[int] c_right_on = right_on + cdef vector[size_type] c_left_on = left_on + cdef vector[size_type] c_right_on = right_on cdef unique_ptr[column] c_result - cdef table_view c_lhs = lhs.view() - cdef table_view c_rhs = rhs.view() + cdef table_view c_lhs = lhs.view().select(c_left_on) + cdef table_view c_rhs = rhs.view().select(c_right_on) if how == "leftsemi": c_result = move(cpp_join.left_semi_join( c_lhs, - c_rhs, - c_left_on, - c_right_on + c_rhs )) elif how == "leftanti": c_result = move(cpp_join.left_anti_join( c_lhs, - c_rhs, - c_left_on, - c_right_on + c_rhs )) else: raise ValueError(f"Invalid join type {how}") From efb60d6c01d7113152a188ab6d22de050b9cc175 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 18 Feb 2021 14:32:08 -0500 Subject: [PATCH 046/138] Revert --- python/cudf/cudf/_lib/join.pyx | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index f31e75f94a8..4cf07dd7e99 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -28,16 +28,10 @@ cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): cdef table_view c_rhs = rhs.view().select(c_right_on) if how == "inner": - if c_lhs.num_rows() < c_rhs.num_rows(): - c_result = move(cpp_join.inner_join( - c_rhs, - c_lhs - )) - else: - c_result = move(cpp_join.inner_join( - c_lhs, - c_rhs - )) + c_result = move(cpp_join.inner_join( + c_lhs, + c_rhs + )) elif how == "left": c_result = move(cpp_join.left_join( c_lhs, From fe6d0b8e4d89a87b48c482a5441e9b0b16f0447d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 18 Feb 2021 14:32:58 -0500 Subject: [PATCH 047/138] Invalid -> Unkown --- python/cudf/cudf/_lib/join.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 4cf07dd7e99..20c14f00957 
100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -43,7 +43,7 @@ cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): c_rhs )) else: - raise ValueError(f"Unkown join type {how}") + raise ValueError(f"Invalid join type {how}") return ( Column.from_unique_ptr(move(c_result.first)), Column.from_unique_ptr(move(c_result.second)) From 547027c39e7ab9992ae60b7859f7a8e563504815 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 18 Feb 2021 15:28:08 -0500 Subject: [PATCH 048/138] Don't mutate lhs/rhs --- python/cudf/cudf/core/join/join.py | 149 +++++++++++++---------------- 1 file changed, 66 insertions(+), 83 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 41830b7a80f..72d6d8588c3 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -9,61 +9,37 @@ ) -class _MISSING_TYPE: - pass - - -MISSING = _MISSING_TYPE() - - class ColumnView: - # A ColumnView represents one column of a Series - # or DataFrame - either an index column or a - # data column - - # we need a different sentinel value than `None` - # because `None` is totally a valid index/column name - def __init__(self, obj, column=MISSING, index=MISSING): - self.obj = obj + def __init__(self, name, column=False, index=False): + self.name = name self.column, self.index = column, index - def get_numeric_index(self): + def get_numeric_index(self, obj): # get the position of the column (including any index columns) - if self.index is MISSING: - index_nlevels = ( - self.obj.index.nlevels if self.obj._index is not None else 0 - ) - return index_nlevels + tuple(self.obj._data).index(self.column) + if self.column: + index_nlevels = obj.index.nlevels if obj._index is not None else 0 + return index_nlevels + tuple(obj._data).index(self.name) else: - return self.obj.index.names.index(self.index) + return obj.index.names.index(self.name) @property def is_index_level(self): # True if this is 
an index column - return self.index is not MISSING + return self.index - @property - def name(self): - # get the name of the column - if self.index is MISSING: - return self.column - else: - return self.index - - @property - def value(self): + def value(self, obj): # get the column - if self.index is MISSING: - return self.obj._data[self.name] + if self.column: + return obj._data[self.name] else: - return self.obj._index._data[self.name] + return obj._index._data[self.name] - def set_value(self, value): + def set_value(self, obj, value): # set the colum - if self.index is MISSING: - self.obj._data[self.name] = value + if self.column: + obj._data[self.name] = value else: - self.obj._index._data[self.name] = value + obj._index._data[self.name] = value JoinKeys = namedtuple("JoinKeys", ["left", "right"]) @@ -234,7 +210,7 @@ def compute_join_keys(self): if self.left_index: left_keys.extend( [ - ColumnView(obj=self.lhs, index=on) + ColumnView(name=on, index=True) for on in self.lhs.index.names ] ) @@ -242,14 +218,14 @@ def compute_join_keys(self): # TODO: require left_on or left_index to be specified left_keys.extend( [ - ColumnView(obj=self.lhs, column=on) + ColumnView(name=on, column=True) for on in _coerce_to_tuple(self.left_on) ] ) if self.right_index: right_keys.extend( [ - ColumnView(obj=self.rhs, index=on) + ColumnView(name=on, index=True) for on in self.rhs.index.names ] ) @@ -257,7 +233,7 @@ def compute_join_keys(self): # TODO: require right_on or right_index to be specified right_keys.extend( [ - ColumnView(obj=self.rhs, column=on) + ColumnView(name=on, column=True) for on in _coerce_to_tuple(self.right_on) ] ) @@ -269,12 +245,8 @@ def compute_join_keys(self): if self.on is not None else set(self.lhs._data.keys()) & set(self.rhs._data.keys()) ) - left_keys = [ - ColumnView(obj=self.lhs, column=on) for on in on_names - ] - right_keys = [ - ColumnView(obj=self.rhs, column=on) for on in on_names - ] + left_keys = [ColumnView(name=on, column=True) for on in 
on_names] + right_keys = [ColumnView(name=on, column=True) for on in on_names] if len(left_keys) != len(right_keys): raise ValueError( @@ -284,36 +256,42 @@ def compute_join_keys(self): self._keys = JoinKeys(left=left_keys, right=right_keys) def perform_merge(self): - self.match_key_dtypes(_input_to_libcudf_castrules_any) + lhs, rhs = self.match_key_dtypes( + self.lhs, self.rhs, _input_to_libcudf_castrules_any + ) - left_key_indices = [key.get_numeric_index() for key in self._keys.left] + left_key_indices = [ + key.get_numeric_index(lhs) for key in self._keys.left + ] right_key_indices = [ - key.get_numeric_index() for key in self._keys.right + key.get_numeric_index(rhs) for key in self._keys.right ] left_rows, right_rows = libcudf.join.join( - self.lhs, - self.rhs, + lhs, + rhs, left_on=left_key_indices, right_on=right_key_indices, how=self.how, ) - return self.construct_result(left_rows, right_rows) + return self.construct_result(lhs, rhs, left_rows, right_rows) - def construct_result(self, left_rows, right_rows): - self.match_key_dtypes(_libcudf_to_output_castrules) + def construct_result(self, lhs, rhs, left_rows, right_rows): + lhs, rhs = self.match_key_dtypes( + lhs, rhs, _libcudf_to_output_castrules + ) # first construct the index. 
if self.left_index and self.right_index: if self.how == "right": - out_index = self.rhs.index._gather(left_rows, nullify=True) + out_index = rhs.index._gather(left_rows, nullify=True) else: - out_index = self.lhs.index._gather(left_rows, nullify=True) + out_index = lhs.index._gather(left_rows, nullify=True) elif self.left_index: # left_index and right_on - out_index = self.rhs.index._gather(right_rows, nullify=True) + out_index = rhs.index._gather(right_rows, nullify=True) elif self.right_index: # right_index and left_on - out_index = self.lhs.index._gather(left_rows, nullify=True) + out_index = lhs.index._gather(left_rows, nullify=True) else: out_index = None @@ -322,11 +300,11 @@ def construct_result(self, left_rows, right_rows): left_names, right_names = self.output_column_names() for lcol in left_names: - data[left_names[lcol]] = self.lhs._data[lcol].take( + data[left_names[lcol]] = lhs._data[lcol].take( left_rows, nullify=True ) for rcol in right_names: - data[right_names[rcol]] = self.rhs._data[rcol].take( + data[right_names[rcol]] = rhs._data[rcol].take( right_rows, nullify=True ) @@ -336,16 +314,12 @@ def construct_result(self, left_rows, right_rows): if self.how == "outer": for lkey, rkey in zip(*self._keys): if lkey.name == rkey.name: - # get the key column as it appears in the result: - out_key = ColumnView( - result, column=lkey.column, index=lkey.index - ) - # fill nulls in the key column with values from the RHS - out_key.set_value( - out_key.value.fillna( - rkey.value.take(right_rows, nullify=True) - ) + lkey.set_value( + result, + lkey.value(result).fillna( + rkey.value(rhs).take(right_rows, nullify=True) + ), ) return self.sort_result(result) @@ -474,33 +448,42 @@ def validate_merge_params( "lsuffix and rsuffix are not defined" ) - def match_key_dtypes(self, match_func): + def match_key_dtypes(self, lhs, rhs, match_func): + out_lhs = lhs.copy(deep=False) + out_rhs = rhs.copy(deep=False) # match the dtypes of the key columns in # self.lhs and 
self.rhs according to the matching # function `match_func` for left_key, right_key in zip(*self._keys): - lcol, rcol = left_key.value, right_key.value + lcol, rcol = left_key.value(lhs), right_key.value(rhs) dtype = match_func(lcol, rcol, how=self.how) - left_key.set_value(lcol.astype(dtype)) - right_key.set_value(rcol.astype(dtype)) + left_key.set_value(out_lhs, lcol.astype(dtype)) + right_key.set_value(out_rhs, rcol.astype(dtype)) + return out_lhs, out_rhs class MergeSemi(MergeBase): def perform_merge(self): - self.match_key_dtypes(_input_to_libcudf_castrules_any) + lhs, rhs = self.match_key_dtypes( + self.lhs, self.rhs, _input_to_libcudf_castrules_any + ) - left_key_indices = [key.get_numeric_index() for key in self._keys.left] + left_key_indices = [ + key.get_numeric_index(lhs) for key in self._keys.left + ] right_key_indices = [ - key.get_numeric_index() for key in self._keys.right + key.get_numeric_index(rhs) for key in self._keys.right ] left_rows = libcudf.join.semi_join( - self.lhs, - self.rhs, + lhs, + rhs, left_on=left_key_indices, right_on=right_key_indices, how=self.how, ) - return self.construct_result(left_rows, cudf.core.column.as_column([])) + return self.construct_result( + lhs, rhs, left_rows, cudf.core.column.as_column([]) + ) def output_column_names(self): left_names, _ = super().output_column_names() From 5f93d23f24467e286612c0b4dae0244d35636607 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 19 Feb 2021 10:19:42 -0500 Subject: [PATCH 049/138] Fix join tests --- python/cudf/cudf/core/join/casting_logic.py | 2 ++ python/cudf/cudf/core/join/join.py | 16 ++++++++++------ python/cudf/cudf/tests/test_joining.py | 6 +++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/join/casting_logic.py b/python/cudf/cudf/core/join/casting_logic.py index acd8efca8a9..7638288f75e 100644 --- a/python/cudf/cudf/core/join/casting_logic.py +++ b/python/cudf/cudf/core/join/casting_logic.py @@ -183,6 +183,8 @@ def 
_libcudf_to_output_castrules(lcol, rcol, how): if pd.api.types.is_dtype_equal(ltype, rtype): return ltype + merge_return_type = _input_to_libcudf_castrules_any(lcol, rcol, how) + l_is_cat = isinstance(ltype, CategoricalDtype) r_is_cat = isinstance(rtype, CategoricalDtype) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 72d6d8588c3..414d8d0dff7 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -277,7 +277,7 @@ def perform_merge(self): def construct_result(self, lhs, rhs, left_rows, right_rows): lhs, rhs = self.match_key_dtypes( - lhs, rhs, _libcudf_to_output_castrules + self.lhs, self.rhs, _libcudf_to_output_castrules ) # first construct the index. @@ -331,9 +331,12 @@ def sort_result(self, result): # the key columns on the other side will be used to sort. if self.sort: if self.on: - return result.sort_values( - _coerce_to_list(self.on), ignore_index=True - ) + if isinstance(result, cudf.Index): + return result.sort_values() + else: + return result.sort_values( + _coerce_to_list(self.on), ignore_index=True + ) by = [] if self.left_index and self.right_index: by.extend(result.index._data.columns) @@ -457,8 +460,9 @@ def match_key_dtypes(self, lhs, rhs, match_func): for left_key, right_key in zip(*self._keys): lcol, rcol = left_key.value(lhs), right_key.value(rhs) dtype = match_func(lcol, rcol, how=self.how) - left_key.set_value(out_lhs, lcol.astype(dtype)) - right_key.set_value(out_rhs, rcol.astype(dtype)) + if dtype: + left_key.set_value(out_lhs, lcol.astype(dtype)) + right_key.set_value(out_rhs, rcol.astype(dtype)) return out_lhs, out_rhs diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d99897584ec..fbb12f897a3 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -265,7 +265,7 @@ def test_dataframe_join_mismatch_cats(how): expect.data_col_right = expect.data_col_right.astype(np.int64) 
expect.data_col_left = expect.data_col_left.astype(np.int64) - assert_eq(expect, got) + assert_eq(expect, got, check_categorical=False) @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) @@ -1224,7 +1224,7 @@ def test_categorical_typecast_inner(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_eq(expect_data, result["key"], check_categorical=False) # Equal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1242,7 +1242,7 @@ def test_categorical_typecast_inner(): expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False) expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_eq(expect_data, result["key"], check_categorical=False) # One is ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) From b7bf82172ef87ab1eca0e0d66ea38fe483db5bd2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 19 Feb 2021 11:59:17 -0500 Subject: [PATCH 050/138] Fix semi/anti join trivial cases --- cpp/src/join/join.cu | 4 +++- cpp/src/join/semi_join.cu | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 9e61a924e03..15b1f216928 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -49,7 +49,9 @@ std::pair, std::unique_ptr> inner_jo // build the hash map from the smaller table. 
if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); - return hj_obj.inner_join(right, compare_nulls, stream, mr); + auto result = hj_obj.inner_join(right, compare_nulls, stream, mr); + return std::make_pair, std::unique_ptr>( + std::move(result.second), std::move(result.first)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); return hj_obj.inner_join(left, compare_nulls, stream, mr); diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 919311f41bf..e5966dd01e4 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -46,6 +48,21 @@ std::unique_ptr left_semi_anti_join( CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); + if (is_trivial_join(left_keys, right_keys, JoinKind)) { + return std::make_unique(cudf::data_type(type_to_id()), + 0, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0); + } + if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid(true, stream); + static_cast(zero.get())->set_value(0, stream); + return cudf::detail::sequence(left_keys.num_rows(), *zero, stream); + } + auto const left_num_rows = left_keys.num_rows(); auto const right_num_rows = right_keys.num_rows(); From 50a2fb2aab777e7a9b87d4d278ae560b6a95ef5d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 19 Feb 2021 16:39:56 -0500 Subject: [PATCH 051/138] When testing join results, use a helper that sorts values --- python/cudf/cudf/tests/test_joining.py | 161 ++++++++++++------------- python/cudf/cudf/tests/test_string.py | 40 +++--- 2 files changed, 96 insertions(+), 105 deletions(-) diff --git 
a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index fbb12f897a3..367e903d02e 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -69,6 +69,35 @@ def pd_odd_joins(left, right, join_type): return left[left.index.isin(right.index)][left.columns] +def assert_join_results_equal(expect, got, how, **kwargs): + if how == "right": + got = got[expect.columns] + + if isinstance(expect, (pd.Series, cudf.Series)): + return assert_eq( + expect.sort_values().reset_index(drop=True), + got.sort_values().reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): + if not len( + expect.columns + ): # can't sort_values() on a df without columns + return assert_eq(expect, got, **kwargs) + + return assert_eq( + expect.sort_values(expect.columns.to_list()).reset_index( + drop=True + ), + got.sort_values(got.columns.to_list()).reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.Index, cudf.Index)): + return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) + else: + raise ValueError(f"Not a join result: {type(expect).__name__}") + + @pytest.mark.parametrize("aa,bb,how,method", make_params()) def test_dataframe_join_how(aa, bb, how, method): df = DataFrame() @@ -113,12 +142,7 @@ def work_gdf(df): # TODO: What is the less hacky way? 
expect.index.name = "bob" got.index.name = "mary" - assert_eq( - got.sort_values(got.columns.to_list()).reset_index(drop=True), - expect.sort_values(expect.columns.to_list()).reset_index( - drop=True - ), - ) + assert_join_results_equal(expect, got, how=how) # if(how=='right'): # _sorted_check_series(expect['a'], expect['b'], # got['a'], got['b']) @@ -188,10 +212,7 @@ def test_dataframe_join_cats(): expect = lhs.to_pandas().join(rhs.to_pandas()) # Note: pandas make an object Index after joining - assert_eq( - got.sort_values(by="b").sort_index().reset_index(drop=True), - expect.reset_index(drop=True), - ) + assert_join_results_equal(expect, got, how="inner") # Just do some rough checking here. assert list(got.columns) == ["b", "c"] @@ -265,7 +286,7 @@ def test_dataframe_join_mismatch_cats(how): expect.data_col_right = expect.data_col_right.astype(np.int64) expect.data_col_left = expect.data_col_left.astype(np.int64) - assert_eq(expect, got, check_categorical=False) + assert_join_results_equal(expect, got, how=how, check_categorical=False) @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) @@ -324,7 +345,7 @@ def test_dataframe_merge_on(on): list(pddf_joined.columns) ).reset_index(drop=True) - assert_eq(cdf_result, pdf_result, check_like=True) + assert_join_results_equal(cdf_result, pdf_result, how="left") merge_func_result_cdf = ( join_result_cudf.to_pandas() @@ -332,7 +353,7 @@ def test_dataframe_merge_on(on): .reset_index(drop=True) ) - assert_eq(merge_func_result_cdf, cdf_result, check_like=True) + assert_join_results_equal(merge_func_result_cdf, cdf_result, how="left") def test_dataframe_merge_on_unknown_column(): @@ -384,7 +405,7 @@ def test_dataframe_empty_merge(): expect = DataFrame({"a": [], "b": [], "c": []}) got = gdf1.merge(gdf2, how="left", on=["a"]) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") def test_dataframe_merge_order(): @@ -409,7 +430,7 @@ def test_dataframe_merge_order(): df2["a"] = [7, 8] df = 
df1.merge(df2, how="left", on=["id", "a"]) - assert_eq(gdf, df) + assert_join_results_equal(df, gdf, how="left") @pytest.mark.parametrize( @@ -554,7 +575,7 @@ def test_merge_left_index_zero(): pd_merge = left.merge(right, left_on="x", right_on="y") gd_merge = gleft.merge(gright, left_on="x", right_on="y") - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") @pytest.mark.parametrize( @@ -575,7 +596,7 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): gright = DataFrame.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = gleft.merge(gright, **kwargs) - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") @pytest.mark.parametrize( @@ -596,7 +617,7 @@ def test_merge_left_right_index_left_right_on_kwargs(kwargs): gright = DataFrame.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = gleft.merge(gright, **kwargs) - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") def test_indicator(): @@ -612,9 +633,10 @@ def test_indicator(): def test_merge_suffixes(): pdf = cudf.DataFrame({"x": [1, 2, 1]}) gdf = cudf.DataFrame({"x": [1, 2, 1]}) - assert_eq( + assert_join_results_equal( gdf.merge(gdf, suffixes=("left", "right")), pdf.merge(pdf, suffixes=("left", "right")), + how="left", ) assert_exceptions_equal( @@ -632,11 +654,14 @@ def test_merge_left_on_right_on(): gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) - assert_eq(left.merge(right, on="xx"), gleft.merge(gright, on="xx")) + assert_join_results_equal( + left.merge(right, on="xx"), gleft.merge(gright, on="xx"), how="left" + ) - assert_eq( + assert_join_results_equal( left.merge(right, left_on="xx", right_on="xx"), gleft.merge(gright, left_on="xx", right_on="xx"), + how="left", ) @@ -712,7 +737,9 @@ def test_merge_sort(ons, hows): pd_merge = left.merge(right, **kwargs) # require the join keys themselves to be sorted correctly # the non-key 
columns will NOT match pandas ordering - assert_eq(pd_merge[kwargs["on"]], gd_merge[kwargs["on"]]) + assert_join_results_equal( + pd_merge[kwargs["on"]], gd_merge[kwargs["on"]], how="left" + ) pd_merge = pd_merge.drop(kwargs["on"], axis=1) gd_merge = gd_merge.drop(kwargs["on"], axis=1) if not pd_merge.empty: @@ -724,7 +751,7 @@ def test_merge_sort(ons, hows): drop=True ) - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") @pytest.mark.parametrize( @@ -785,7 +812,7 @@ def test_join_datetimes_index(dtype): assert gdf["d"].dtype == np.dtype(dtype) - assert_eq(pdf, gdf) + assert_join_results_equal(pdf, gdf, how="inner") def test_join_with_different_names(): @@ -795,7 +822,7 @@ def test_join_with_different_names(): gright = DataFrame.from_pandas(right) pd_merge = left.merge(right, how="outer", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="outer", left_on=["a"], right_on=["b"]) - assert_eq(pd_merge, gd_merge.sort_values(by=["a"]).reset_index(drop=True)) + assert_join_results_equal(pd_merge, gd_merge, how="outer") def test_join_same_name_different_order(): @@ -805,9 +832,7 @@ def test_join_same_name_different_order(): gright = DataFrame.from_pandas(right) pd_merge = left.merge(right, left_on=["a", "b"], right_on=["b", "a"]) gd_merge = gleft.merge(gright, left_on=["a", "b"], right_on=["b", "a"]) - assert_eq( - pd_merge, gd_merge.sort_values(by=["a_x"]).reset_index(drop=True) - ) + assert_join_results_equal(pd_merge, gd_merge, how="left") def test_join_empty_table_dtype(): @@ -878,10 +903,7 @@ def test_join_multi(how, column_a, column_b, column_c): gdf_result = gdf_result[columns] pdf_result = pdf_result[columns] - assert_eq( - gdf_result.reset_index(drop=True).fillna(-1), - pdf_result.sort_index().reset_index(drop=True).fillna(-1), - ) + assert_join_results_equal(pdf_result, gdf_result, how="inner") @pytest.mark.parametrize( @@ -971,7 +993,7 @@ def test_merge_multi(kwargs): expect.index = range(len(expect)) 
got.index = range(len(got)) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize("dtype_l", INTEGER_TYPES) @@ -1001,7 +1023,7 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize("dtype_l", ["float32", "float64"]) @@ -1036,7 +1058,7 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize("dtype_l", NUMERIC_TYPES) @@ -1072,7 +1094,7 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def test_typecast_on_join_no_float_round(): @@ -1096,7 +1118,7 @@ def test_typecast_on_join_no_float_round(): got = gdf_l.merge(gdf_r, on="join_col", how="left") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize( @@ -1169,7 +1191,7 @@ def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize("dtype_l", ["category", "str", "int32", "float32"]) @@ -1204,7 +1226,7 @@ def test_typecast_on_join_categorical(dtype_l, dtype_r): ) got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def make_categorical_dataframe(categories, ordered=False): @@ -1431,20 +1453,10 @@ def test_index_join(lhs, rhs, how, level): g_lhs = l_df.set_index(lhs).index g_rhs = r_df.set_index(rhs).index - expected = ( - p_lhs.join(p_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) 
- got = ( - g_lhs.join(g_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) def test_index_join_corner_cases(): @@ -1465,20 +1477,10 @@ def test_index_join_corner_cases(): p_rhs = r_pdf.set_index(rhs).index g_lhs = l_df.set_index(lhs).index g_rhs = r_df.set_index(rhs).index - expected = ( - p_lhs.join(p_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) - got = ( - g_lhs.join(g_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) # sort is supported only in case of two non-MultiIndex join # Join when column name doesn't match with level @@ -1494,7 +1496,7 @@ def test_index_join_corner_cases(): expected = p_lhs.join(p_rhs, how=how, sort=True) got = g_lhs.join(g_rhs, how=how, sort=True) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) # Pandas Index.join on categorical column returns generic column # but cudf will be returning a categorical column itself. 
@@ -1508,22 +1510,12 @@ def test_index_join_corner_cases(): p_rhs = r_pdf.set_index(rhs).index g_lhs = l_df.set_index(lhs).index g_rhs = r_df.set_index(rhs).index - expected = ( - p_lhs.join(p_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) - got = ( - g_lhs.join(g_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) got["a"] = got["a"].astype(expected["a"].dtype) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) def test_index_join_exception_cases(): @@ -1575,7 +1567,7 @@ def test_typecast_on_join_indexes(): got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def test_typecast_on_join_multiindices(): @@ -1626,7 +1618,7 @@ def test_typecast_on_join_multiindices(): expect = expect.set_index(["join_col_0", "join_col_1", "join_col_2"]) got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def test_typecast_on_join_indexes_matching_categorical(): @@ -1653,7 +1645,7 @@ def test_typecast_on_join_indexes_matching_categorical(): expect = expect.set_index("join_col") got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize( @@ -1705,9 +1697,10 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): expect = check_lhs.merge(check_rhs, how=how, **kwargs) got = lhs.merge(rhs, how=how, **kwargs) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how=how) +@pytest.mark.xfail(reason="Cannot sort values of list dtype") @pytest.mark.parametrize( "how", ["left", "inner", "right", "leftanti", 
"leftsemi"] ) @@ -1732,4 +1725,4 @@ def test_merge_with_lists(how): expect = pd_left.merge(pd_right, on="a") got = gd_left.merge(gd_right, on="a") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how=how) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 080420c8f75..5e9d75c7568 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -14,6 +14,7 @@ from cudf.core import DataFrame, Series from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index +from cudf.tests.test_joining import assert_join_results_equal from cudf.tests.utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -861,16 +862,12 @@ def test_string_split(data, pat, n, expand): @pytest.mark.parametrize( - "str_data,str_data_raise", - [ - ([], 0), - (["a", "b", "c", "d", "e"], 0), - ([None, None, None, None, None], 1), - ], + "str_data", + [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], ) @pytest.mark.parametrize("num_keys", [1, 2, 3]) @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_key(str_data, str_data_raise, num_keys, how): +def test_string_join_key(str_data, num_keys, how): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() @@ -884,19 +881,17 @@ def test_string_join_key(str_data, str_data_raise, num_keys, how): pdf2 = pdf.copy() gdf2 = gdf.copy() - expectation = raise_builder( - [0 if how == "right" else str_data_raise], (AssertionError) - ) + expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) + got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) - with expectation: - expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) - got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] # reorder columns - if len(expect) == 0 and len(got) == 0: - expect = 
expect.reset_index(drop=True) - got = got[expect.columns] + if how == "right": + got = got[expect.columns] # reorder columns - assert_eq(expect, got) + assert_join_results_equal(expect, got, how=how) @pytest.mark.parametrize( @@ -940,7 +935,7 @@ def test_string_join_key_nulls(str_data_nulls): expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize( @@ -969,7 +964,10 @@ def test_string_join_non_key(str_data, num_cols, how): expect = expect.reset_index(drop=True) got = got[expect.columns] - assert_eq(expect, got) + if how == "right": + got = got[expect.columns] # reorder columns + + assert_join_results_equal(expect, got, how=how) @pytest.mark.parametrize( @@ -1010,7 +1008,7 @@ def test_string_join_non_key_nulls(str_data_nulls): expect = expect.reset_index(drop=True) got = got[expect.columns] - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") def test_string_join_values_nulls(): @@ -1050,7 +1048,7 @@ def test_string_join_values_nulls(): expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize( From ff0ae791595c8840f6c3857a7c876657c298a71e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 19 Feb 2021 18:59:36 -0500 Subject: [PATCH 052/138] Totally broken commit --- python/cudf/cudf/core/frame.py | 39 ++-- python/cudf/cudf/core/join/__init__.py | 2 +- python/cudf/cudf/core/join/_join_helpers.py | 225 ++++++++++++++++++++ python/cudf/cudf/core/join/casting_logic.py | 209 ------------------ python/cudf/cudf/core/join/join.py | 178 +++++----------- 5 files changed, 300 insertions(+), 353 deletions(-) create mode 100644 python/cudf/cudf/core/join/_join_helpers.py delete mode 100644 python/cudf/cudf/core/join/casting_logic.py diff --git 
a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ccbf2cd10b6..1dfb65ace38 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3399,29 +3399,24 @@ def _merge( indicator=indicator, suffixes=suffixes, ) - - lhs = self - rhs = right - - from cudf.core.join import Merge - - mergeop = Merge( - lhs, - rhs, - on, - left_on, - right_on, - left_index, - right_index, - how, - sort, - lsuffix, - rsuffix, - method, - indicator, - suffixes, + from cudf.core.join.join import merge + + return merge( + self, + right, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + sort=sort, + lsuffix=lsuffix, + rsuffix=rsuffix, + method=method, + indicator=indicator, + suffixes=suffixes, ) - return mergeop.perform_merge() def _is_sorted(self, ascending=None, null_position=None): """ diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py index 6d126c8af4d..0463b8f9df1 100644 --- a/python/cudf/cudf/core/join/__init__.py +++ b/python/cudf/cudf/core/join/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2020, NVIDIA CORPORATION. -from cudf.core.join.join import Merge +from cudf.core.join.join import merge diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py new file mode 100644 index 00000000000..68c3e33ac25 --- /dev/null +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -0,0 +1,225 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +import warnings + +import numpy as np +import pandas as pd + +import cudf +from cudf.core.dtypes import CategoricalDtype + + +class _Indexer: + # Indexer into a column (either a data column or index level). 
+ # + # >>> df + # a + # b + # 4 1 + # 5 2 + # 6 3 + # >>> _Indexer("a", column=True).value(df) # returns column "a" of df + # >>> _Indexer("b", index=True).value(df) # returns index level "b" of df + + def __init__(self, name, column=False, index=False): + self.name = name + self.column, self.index = column, index + + def value(self, obj): + # get the column from `obj` + if self.column: + return obj._data[self.name] + else: + return obj._index._data[self.name] + + def set_value(self, obj, value): + # set the colum in `obj` + if self.column: + obj._data[self.name] = value + else: + obj._index._data[self.name] = value + + def get_numeric_index(self, obj): + # get the position of the column in `obj` + # (counting any index columns) + if self.column: + index_nlevels = obj.index.nlevels if obj._index is not None else 0 + return index_nlevels + tuple(obj._data).index(self.name) + else: + return obj.index.names.index(self.name) + + +def _coerce_to_tuple(obj): + if hasattr(obj, "__iter__") and not isinstance(obj, str): + return tuple(obj) + else: + return (obj,) + + +def _coerce_to_list(obj): + return list(_coerce_to_tuple(obj)) + + +def _cast_join_categorical_keys_both(lcol, rcol, how): + # cast lcol and rcol to a common type when they are *both* + # categorical types. + # + # The commontype depends on both `how` and the specifics of the + # categorical variables to be merged. + + ltype, rtype = lcol.dtype, rcol.dtype + + # when both are ordered and both have the same categories, + # no casting required: + if ltype == rtype: + return lcol, rcol + + # Merging categorical variables when only one side is ordered is + # ambiguous and not allowed. + if ltype.ordered != rtype.ordered: + raise TypeError( + "Merging on categorical variables with mismatched" + " ordering is ambiguous" + ) + + if ltype.ordered and rtype.ordered: + # if we get to here, categories must be what causes the + # dtype equality check to fail. 
And we can never merge + # two ordered categoricals with different categories + raise TypeError( + f"{how} merge between categoricals with " + "different categories is only valid when " + "neither side is ordered" + ) + + # the following should now always hold + assert not ltype.ordered and not rtype.ordered + + if how == "inner": + # demote to underlying types -- we will promote them back later + return _cast_join_keys(ltype.categories, rtype.categories, how) + elif how in {"left", "leftanti", "leftsemi"}: + # always cast to left type + return lcol, rcol.astype(ltype) + else: + # merge categories + merged_categories = cudf.concat( + [ltype.categories, rtype.categories] + ).unique() + common_type = cudf.CategoricalDtype( + categories=merged_categories, ordered=False + ) + return lcol.astype(common_type), rcol.astype(common_type) + + +def _cast_join_categorical_keys(lcol, rcol, how): + # cast the keys lcol and rcol to a common dtype + # when at least one of them is a categorical type + + l_is_cat = isinstance(lcol.dtype, CategoricalDtype) + r_is_cat = isinstance(rcol.dtype, CategoricalDtype) + + if l_is_cat and r_is_cat: + # if both are categoricals, logic is complicated: + return _cast_join_categorical_keys_both(lcol, rcol, how) + elif l_is_cat or r_is_cat: + if l_is_cat and how in {"left", "leftsemi", "leftanti"}: + return (lcol, rcol.astype(lcol.dtype)) + common_type = ( + lcol.dtype.categories.dtype + if l_is_cat + else rcol.dtype.categories.dtype + ) + return lcol.astype(common_type), rcol.astype(common_type) + else: + raise ValueError("Neither operand is categorical") + + +def _cast_join_keys(lcol, rcol, how): + # cast the keys lcol and rcol to a common dtype + + ltype = lcol.dtype + rtype = rcol.dtype + + # if either side is categorical, different logic + if isinstance(ltype, CategoricalDtype) or isinstance( + rtype, CategoricalDtype + ): + return _cast_join_categorical_keys(lcol, rcol, how) + + if pd.api.types.is_dtype_equal(ltype, rtype): + return lcol, 
rcol + + if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): + common_type = ( + max(ltype, type) + if ltype.kind == rtype.kind + else np.find_common_type([], (ltype, rtype)) + ) + + elif np.issubdtype(ltype, np.datetime64) and np.issubdtype( + rtype, np.datetime64 + ): + common_type = max(ltype, rtype) + + if how == "left": + if rcol.fillna(0).can_cast_safely(ltype): + return lcol, rcol.astype(ltype) + else: + warnings.warn( + "Can't safely cast column from {rtype} to {ltype}, " + "upcasting to {common_type}." + ) + + if common_type: + lcol, rcol = lcol.astype(common_type), rcol.astype(common_type) + + return lcol, rcol + + +def _libcudf_to_output_castrules(lcol, rcol, how): + """ + Determine what dtype an output merge key column should be + cast to after it has been processed by libcudf. Determine + if a column should be promoted to a categorical datatype. + For inner merges between unordered categoricals, we get a + new categorical variable containing the intersection of + the two source variables. For left or right joins, we get + the original categorical variable from whichever was the + major operand of the join, e.g. left for a left join or + right for a right join. In the case of an outer join, the + result will be a new categorical variable with both sets + of categories. 
+ """ + merge_return_type = None + + ltype = lcol.dtype + rtype = rcol.dtype + + if pd.api.types.is_dtype_equal(ltype, rtype): + return ltype + + merge_return_type = _cast_join_keys(lcol, rcol, how) + + l_is_cat = isinstance(ltype, CategoricalDtype) + r_is_cat = isinstance(rtype, CategoricalDtype) + + # we currently only need to do this for categorical variables + if how == "inner": + if l_is_cat and r_is_cat: + merge_return_type = "category" + elif how in {"left", "leftsemi", "leftanti"}: + if l_is_cat: + merge_return_type = ltype + elif how == "right": + if r_is_cat: + merge_return_type = rtype + elif how == "outer": + if l_is_cat and r_is_cat: + new_cats = cudf.concat( + [ltype.categories, rtype.categories] + ).unique() + merge_return_type = cudf.CategoricalDtype( + categories=new_cats, ordered=ltype.ordered + ) + return merge_return_type diff --git a/python/cudf/cudf/core/join/casting_logic.py b/python/cudf/cudf/core/join/casting_logic.py deleted file mode 100644 index 7638288f75e..00000000000 --- a/python/cudf/cudf/core/join/casting_logic.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -import warnings - -import numpy as np -import pandas as pd - -import cudf -from cudf.core.dtypes import CategoricalDtype - - -def _input_to_libcudf_castrules_both_cat(lcol, rcol, how): - """ - Based off the left and right operands, determine the libcudf - merge dtype or error for corner cases where the merge cannot - proceed. This function handles categorical variables. - Categorical variable typecasting logic depends on both `how` - and the specifics of the categorical variables to be merged. - Merging categorical variables when only one side is ordered - is ambiguous and not allowed. Merging when both categoricals - are ordered is allowed, but only when the categories are - exactly equal and have equal ordering, and will result in the - common dtype. 
- When both sides are unordered, the result categorical depends - on the kind of join: - - For inner joins, the result will be the intersection of the - categories - - For left or right joins, the result will be the the left or - right dtype respectively. This extends to semi and anti joins. - - For outer joins, the result will be the union of categories - from both sides. - - """ - ltype = lcol.dtype - rtype = rcol.dtype - - # this function is only to be used to resolve the result when both - # sides are categorical - if not isinstance(ltype, CategoricalDtype) and isinstance( - rtype, CategoricalDtype - ): - raise TypeError("Both operands must be CategoricalDtype") - - # true for every configuration - if ltype == rtype: - return ltype - - # raise for any join where ordering doesn't match - if ltype.ordered != rtype.ordered: - raise TypeError( - "Merging on categorical variables with mismatched" - " ordering is ambiguous" - ) - elif ltype.ordered and rtype.ordered: - # if we get to here, categories must be what causes the - # dtype equality check to fail. 
And we can never merge - # two ordered categoricals with different categories - raise TypeError( - f"{how} merge between categoricals with " - "different categories is only valid when " - "neither side is ordered" - ) - - elif how == "inner": - # neither ordered, so categories must be different - # demote to underlying types - return _input_to_libcudf_castrules_any( - ltype.categories, rtype.categories, how - ) - - elif how == "left": - return ltype - elif how == "right": - return rtype - - elif how == "outer": - new_cats = cudf.concat([ltype.categories, rtype.categories]).unique() - return cudf.CategoricalDtype(categories=new_cats, ordered=False) - - -def _input_to_libcudf_castrules_any_cat(lcol, rcol, how): - - l_is_cat = isinstance(lcol.dtype, CategoricalDtype) - r_is_cat = isinstance(rcol.dtype, CategoricalDtype) - - if l_is_cat and r_is_cat: - return _input_to_libcudf_castrules_both_cat(lcol, rcol, how) - elif l_is_cat or r_is_cat: - if l_is_cat and how == "left": - return lcol.dtype - if r_is_cat and how == "right": - return rcol.dtype - return ( - lcol.dtype.categories.dtype - if l_is_cat - else rcol.dtype.categories.dtype - ) - else: - raise ValueError("Neither operand is categorical") - - -def _input_to_libcudf_castrules_any(lcol, rcol, how): - """ - Determine what dtype the left and right hand - input columns must be cast to for a libcudf - join to proceed. 
- """ - - cast_warn = ( - "can't safely cast column from {} with type" - " {} to {}, upcasting to {}" - ) - - ltype = lcol.dtype - rtype = rcol.dtype - - # if either side is categorical, different logic - if isinstance(ltype, CategoricalDtype) or isinstance( - rtype, CategoricalDtype - ): - return _input_to_libcudf_castrules_any_cat(lcol, rcol, how) - - libcudf_join_type = None - if pd.api.types.is_dtype_equal(ltype, rtype): - libcudf_join_type = ltype - elif how == "left": - check_col = rcol.fillna(0) - if not check_col.can_cast_safely(ltype): - libcudf_join_type = _input_to_libcudf_castrules_any( - lcol, rcol, "inner" - ) - warnings.warn( - cast_warn.format("right", rtype, ltype, libcudf_join_type) - ) - else: - libcudf_join_type = ltype - elif how == "right": - check_col = lcol.fillna(0) - if not check_col.can_cast_safely(rtype): - libcudf_join_type = _input_to_libcudf_castrules_any( - lcol, rcol, "inner" - ) - warnings.warn( - cast_warn.format("left", ltype, rtype, libcudf_join_type) - ) - else: - libcudf_join_type = rtype - elif how in {"inner", "outer"}: - if (np.issubdtype(ltype, np.number)) and ( - np.issubdtype(rtype, np.number) - ): - if ltype.kind == rtype.kind: - # both ints or both floats - libcudf_join_type = max(ltype, rtype) - else: - libcudf_join_type = np.find_common_type([], [ltype, rtype]) - elif np.issubdtype(ltype, np.datetime64) and np.issubdtype( - rtype, np.datetime64 - ): - libcudf_join_type = max(ltype, rtype) - return libcudf_join_type - - -def _libcudf_to_output_castrules(lcol, rcol, how): - """ - Determine what dtype an output merge key column should be - cast to after it has been processed by libcudf. Determine - if a column should be promoted to a categorical datatype. - For inner merges between unordered categoricals, we get a - new categorical variable containing the intersection of - the two source variables. 
For left or right joins, we get - the original categorical variable from whichever was the - major operand of the join, e.g. left for a left join or - right for a right join. In the case of an outer join, the - result will be a new categorical variable with both sets - of categories. - """ - merge_return_type = None - - ltype = lcol.dtype - rtype = rcol.dtype - - if pd.api.types.is_dtype_equal(ltype, rtype): - return ltype - - merge_return_type = _input_to_libcudf_castrules_any(lcol, rcol, how) - - l_is_cat = isinstance(ltype, CategoricalDtype) - r_is_cat = isinstance(rtype, CategoricalDtype) - - # we currently only need to do this for categorical variables - if how == "inner": - if l_is_cat and r_is_cat: - merge_return_type = "category" - elif how == "left": - if l_is_cat: - merge_return_type = ltype - elif how == "right": - if r_is_cat: - merge_return_type = rtype - elif how == "outer": - if l_is_cat and r_is_cat: - new_cats = cudf.concat( - [ltype.categories, rtype.categories] - ).unique() - merge_return_type = cudf.CategoricalDtype( - categories=new_cats, ordered=ltype.ordered - ) - return merge_return_type diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 414d8d0dff7..dcb06d3aee2 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -3,105 +3,62 @@ import cudf from cudf import _lib as libcudf -from cudf.core.join.casting_logic import ( - _input_to_libcudf_castrules_any, +from cudf.core.join._join_helpers import ( + _cast_join_keys, + _coerce_to_list, + _coerce_to_tuple, + _Indexer, _libcudf_to_output_castrules, ) -class ColumnView: - def __init__(self, name, column=False, index=False): - self.name = name - self.column, self.index = column, index - - def get_numeric_index(self, obj): - # get the position of the column (including any index columns) - if self.column: - index_nlevels = obj.index.nlevels if obj._index is not None else 0 - return index_nlevels + 
tuple(obj._data).index(self.name) - else: - return obj.index.names.index(self.name) - - @property - def is_index_level(self): - # True if this is an index column - return self.index - - def value(self, obj): - # get the column - if self.column: - return obj._data[self.name] - else: - return obj._index._data[self.name] - - def set_value(self, obj, value): - # set the colum - if self.column: - obj._data[self.name] = value - else: - obj._index._data[self.name] = value - - -JoinKeys = namedtuple("JoinKeys", ["left", "right"]) - - -def Merge( +def merge( lhs, rhs, - on=None, - left_on=None, - right_on=None, - left_index=False, - right_index=False, - how="inner", - sort=False, - lsuffix="_x", - rsuffix="_y", - method=None, - indicator=None, - suffixes=None, + *, + on, + left_on, + right_on, + left_index, + right_index, + how, + sort, + lsuffix, + rsuffix, + method, + indicator, + suffixes, ): - if how not in {"leftsemi", "leftanti"}: - return MergeBase( - lhs, - rhs, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - how=how, - sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, - method=method, - indicator=indicator, - suffixes=suffixes, - ) + if how in {"leftsemi", "leftanti"}: + merge_cls = MergeSemi else: - return MergeSemi( - lhs, - rhs, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - how=how, - sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, - method=method, - indicator=indicator, - suffixes=suffixes, - ) - + merge_cls = Merge + mergeobj = merge_cls( + lhs, + rhs, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + sort=sort, + lsuffix=lsuffix, + rsuffix=rsuffix, + method=method, + indicator=indicator, + ) + return mergeobj.perform_merge() + + +class Merge(object): + JoinKeys = namedtuple("JoinKeys", ["left", "right"]) -class MergeBase(object): def __init__( self, lhs, rhs, + *, on=None, left_on=None, 
right_on=None, @@ -171,11 +128,8 @@ def __init__( suffixes=suffixes, ) - # warning: self.lhs and self.rhs are mutated both before - # and after the join - self.lhs = lhs.copy(deep=False) - self.rhs = rhs.copy(deep=False) - + self.lhs = lhs + self.rhs = rhs self.on = on self.left_on = left_on self.right_on = right_on @@ -188,6 +142,7 @@ def __init__( self.suffixes = suffixes self.out_class = cudf.DataFrame + if isinstance(self.lhs, cudf.MultiIndex) or isinstance( self.rhs, cudf.MultiIndex ): @@ -198,7 +153,6 @@ def __init__( self.compute_join_keys() def compute_join_keys(self): - if ( self.left_index or self.right_index @@ -210,7 +164,7 @@ def compute_join_keys(self): if self.left_index: left_keys.extend( [ - ColumnView(name=on, index=True) + _Indexer(name=on, index=True) for on in self.lhs.index.names ] ) @@ -218,14 +172,14 @@ def compute_join_keys(self): # TODO: require left_on or left_index to be specified left_keys.extend( [ - ColumnView(name=on, column=True) + _Indexer(name=on, column=True) for on in _coerce_to_tuple(self.left_on) ] ) if self.right_index: right_keys.extend( [ - ColumnView(name=on, index=True) + _Indexer(name=on, index=True) for on in self.rhs.index.names ] ) @@ -233,7 +187,7 @@ def compute_join_keys(self): # TODO: require right_on or right_index to be specified right_keys.extend( [ - ColumnView(name=on, column=True) + _Indexer(name=on, column=True) for on in _coerce_to_tuple(self.right_on) ] ) @@ -245,20 +199,18 @@ def compute_join_keys(self): if self.on is not None else set(self.lhs._data.keys()) & set(self.rhs._data.keys()) ) - left_keys = [ColumnView(name=on, column=True) for on in on_names] - right_keys = [ColumnView(name=on, column=True) for on in on_names] + left_keys = [_Indexer(name=on, column=True) for on in on_names] + right_keys = [_Indexer(name=on, column=True) for on in on_names] if len(left_keys) != len(right_keys): raise ValueError( "Merge operands must have same number of join key columns" ) - self._keys = 
JoinKeys(left=left_keys, right=right_keys) + self._keys = self.__class__.JoinKeys(left=left_keys, right=right_keys) def perform_merge(self): - lhs, rhs = self.match_key_dtypes( - self.lhs, self.rhs, _input_to_libcudf_castrules_any - ) + lhs, rhs = self.match_key_dtypes(self.lhs, self.rhs, _cast_join_keys) left_key_indices = [ key.get_numeric_index(lhs) for key in self._keys.left @@ -376,10 +328,7 @@ def output_column_names(self): else: key_columns_with_same_name = [] for lkey, rkey in zip(*self._keys): - if (lkey.is_index_level, rkey.is_index_level) == ( - False, - False, - ): + if (lkey.index, rkey.index) == (False, False,): if lkey.name == rkey.name: key_columns_with_same_name.append(lkey.name) for name in common_names: @@ -466,11 +415,9 @@ def match_key_dtypes(self, lhs, rhs, match_func): return out_lhs, out_rhs -class MergeSemi(MergeBase): +class MergeSemi(Merge): def perform_merge(self): - lhs, rhs = self.match_key_dtypes( - self.lhs, self.rhs, _input_to_libcudf_castrules_any - ) + lhs, rhs = self.match_key_dtypes(self.lhs, self.rhs, _cast_join_keys) left_key_indices = [ key.get_numeric_index(lhs) for key in self._keys.left @@ -492,14 +439,3 @@ def perform_merge(self): def output_column_names(self): left_names, _ = super().output_column_names() return left_names, {} - - -def _coerce_to_tuple(obj): - if hasattr(obj, "__iter__") and not isinstance(obj, str): - return tuple(obj) - else: - return (obj,) - - -def _coerce_to_list(obj): - return list(_coerce_to_tuple(obj)) From 07cd05237fade5f13112f46daf1b7b3ec66c6e41 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 20 Feb 2021 10:23:42 -0500 Subject: [PATCH 053/138] Cleanup --- python/cudf/cudf/core/join/_join_helpers.py | 190 ++++++++------------ python/cudf/cudf/core/join/join.py | 124 +++++++------ python/cudf/cudf/tests/test_joining.py | 5 +- 3 files changed, 140 insertions(+), 179 deletions(-) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 
68c3e33ac25..2b4c655f057 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -49,15 +49,69 @@ def get_numeric_index(self, obj): return obj.index.names.index(self.name) -def _coerce_to_tuple(obj): - if hasattr(obj, "__iter__") and not isinstance(obj, str): - return tuple(obj) - else: - return (obj,) +def _cast_join_keys(lcol, rcol, how): + # cast the keys lcol and rcol to a common dtype + ltype = lcol.dtype + rtype = rcol.dtype -def _coerce_to_list(obj): - return list(_coerce_to_tuple(obj)) + # if either side is categorical, different logic + if isinstance(ltype, CategoricalDtype) or isinstance( + rtype, CategoricalDtype + ): + return _cast_join_categorical_keys(lcol, rcol, how) + + if pd.api.types.is_dtype_equal(ltype, rtype): + return ltype + + if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): + common_type = ( + max(ltype, rtype) + if ltype.kind == rtype.kind + else np.find_common_type([], (ltype, rtype)) + ) + + elif np.issubdtype(ltype, np.datetime64) and np.issubdtype( + rtype, np.datetime64 + ): + common_type = max(ltype, rtype) + + if how == "left": + if rcol.fillna(0).can_cast_safely(ltype): + return ltype + else: + warnings.warn( + f"Can't safely cast column from {rtype} to {ltype}, " + "upcasting to {common_type}." 
+ ) + + if common_type: + return common_type + + return None + + +def _cast_join_categorical_keys(lcol, rcol, how): + # cast the keys lcol and rcol to a common dtype + # when at least one of them is a categorical type + + l_is_cat = isinstance(lcol.dtype, CategoricalDtype) + r_is_cat = isinstance(rcol.dtype, CategoricalDtype) + + if l_is_cat and r_is_cat: + # if both are categoricals, logic is complicated: + return _cast_join_categorical_keys_both(lcol, rcol, how) + elif l_is_cat or r_is_cat: + if l_is_cat and how in {"left", "leftsemi", "leftanti"}: + return lcol.dtype + common_type = ( + lcol.dtype.categories.dtype + if l_is_cat + else rcol.dtype.categories.dtype + ) + return common_type + else: + raise ValueError("Neither operand is categorical") def _cast_join_categorical_keys_both(lcol, rcol, how): @@ -72,7 +126,7 @@ def _cast_join_categorical_keys_both(lcol, rcol, how): # when both are ordered and both have the same categories, # no casting required: if ltype == rtype: - return lcol, rcol + return ltype # Merging categorical variables when only one side is ordered is # ambiguous and not allowed. 
@@ -96,11 +150,11 @@ def _cast_join_categorical_keys_both(lcol, rcol, how): assert not ltype.ordered and not rtype.ordered if how == "inner": - # demote to underlying types -- we will promote them back later + # cast to category types -- we must cast them back later return _cast_join_keys(ltype.categories, rtype.categories, how) elif how in {"left", "leftanti", "leftsemi"}: # always cast to left type - return lcol, rcol.astype(ltype) + return ltype else: # merge categories merged_categories = cudf.concat( @@ -109,117 +163,15 @@ def _cast_join_categorical_keys_both(lcol, rcol, how): common_type = cudf.CategoricalDtype( categories=merged_categories, ordered=False ) - return lcol.astype(common_type), rcol.astype(common_type) + return common_type -def _cast_join_categorical_keys(lcol, rcol, how): - # cast the keys lcol and rcol to a common dtype - # when at least one of them is a categorical type - - l_is_cat = isinstance(lcol.dtype, CategoricalDtype) - r_is_cat = isinstance(rcol.dtype, CategoricalDtype) - - if l_is_cat and r_is_cat: - # if both are categoricals, logic is complicated: - return _cast_join_categorical_keys_both(lcol, rcol, how) - elif l_is_cat or r_is_cat: - if l_is_cat and how in {"left", "leftsemi", "leftanti"}: - return (lcol, rcol.astype(lcol.dtype)) - common_type = ( - lcol.dtype.categories.dtype - if l_is_cat - else rcol.dtype.categories.dtype - ) - return lcol.astype(common_type), rcol.astype(common_type) +def _coerce_to_tuple(obj): + if hasattr(obj, "__iter__") and not isinstance(obj, str): + return tuple(obj) else: - raise ValueError("Neither operand is categorical") - - -def _cast_join_keys(lcol, rcol, how): - # cast the keys lcol and rcol to a common dtype - - ltype = lcol.dtype - rtype = rcol.dtype - - # if either side is categorical, different logic - if isinstance(ltype, CategoricalDtype) or isinstance( - rtype, CategoricalDtype - ): - return _cast_join_categorical_keys(lcol, rcol, how) - - if pd.api.types.is_dtype_equal(ltype, rtype): - 
return lcol, rcol - - if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): - common_type = ( - max(ltype, type) - if ltype.kind == rtype.kind - else np.find_common_type([], (ltype, rtype)) - ) - - elif np.issubdtype(ltype, np.datetime64) and np.issubdtype( - rtype, np.datetime64 - ): - common_type = max(ltype, rtype) - - if how == "left": - if rcol.fillna(0).can_cast_safely(ltype): - return lcol, rcol.astype(ltype) - else: - warnings.warn( - "Can't safely cast column from {rtype} to {ltype}, " - "upcasting to {common_type}." - ) - - if common_type: - lcol, rcol = lcol.astype(common_type), rcol.astype(common_type) - - return lcol, rcol - - -def _libcudf_to_output_castrules(lcol, rcol, how): - """ - Determine what dtype an output merge key column should be - cast to after it has been processed by libcudf. Determine - if a column should be promoted to a categorical datatype. - For inner merges between unordered categoricals, we get a - new categorical variable containing the intersection of - the two source variables. For left or right joins, we get - the original categorical variable from whichever was the - major operand of the join, e.g. left for a left join or - right for a right join. In the case of an outer join, the - result will be a new categorical variable with both sets - of categories. 
- """ - merge_return_type = None - - ltype = lcol.dtype - rtype = rcol.dtype - - if pd.api.types.is_dtype_equal(ltype, rtype): - return ltype - - merge_return_type = _cast_join_keys(lcol, rcol, how) + return (obj,) - l_is_cat = isinstance(ltype, CategoricalDtype) - r_is_cat = isinstance(rtype, CategoricalDtype) - # we currently only need to do this for categorical variables - if how == "inner": - if l_is_cat and r_is_cat: - merge_return_type = "category" - elif how in {"left", "leftsemi", "leftanti"}: - if l_is_cat: - merge_return_type = ltype - elif how == "right": - if r_is_cat: - merge_return_type = rtype - elif how == "outer": - if l_is_cat and r_is_cat: - new_cats = cudf.concat( - [ltype.categories, rtype.categories] - ).unique() - merge_return_type = cudf.CategoricalDtype( - categories=new_cats, ordered=ltype.ordered - ) - return merge_return_type +def _coerce_to_list(obj): + return list(_coerce_to_tuple(obj)) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index dcb06d3aee2..41265ba6af5 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -8,7 +8,6 @@ _coerce_to_list, _coerce_to_tuple, _Indexer, - _libcudf_to_output_castrules, ) @@ -47,6 +46,7 @@ def merge( rsuffix=rsuffix, method=method, indicator=indicator, + suffixes=suffixes, ) return mergeobj.perform_merge() @@ -59,18 +59,18 @@ def __init__( lhs, rhs, *, - on=None, - left_on=None, - right_on=None, - left_index=False, - right_index=False, - how="inner", - sort=False, - lsuffix="_x", - rsuffix="_y", - method=None, - indicator=None, - suffixes=None, + on, + left_on, + right_on, + left_index, + right_index, + how, + sort, + lsuffix, + rsuffix, + method, + indicator, + suffixes, ): """ Manage the merging of two Frames. 
@@ -197,7 +197,7 @@ def compute_join_keys(self): on_names = ( _coerce_to_tuple(self.on) if self.on is not None - else set(self.lhs._data.keys()) & set(self.rhs._data.keys()) + else set(self.lhs._data) & set(self.rhs._data) ) left_keys = [_Indexer(name=on, column=True) for on in on_names] right_keys = [_Indexer(name=on, column=True) for on in on_names] @@ -225,13 +225,10 @@ def perform_merge(self): right_on=right_key_indices, how=self.how, ) + lhs, rhs = self.restore_categorical_keys(lhs, rhs) return self.construct_result(lhs, rhs, left_rows, right_rows) def construct_result(self, lhs, rhs, left_rows, right_rows): - lhs, rhs = self.match_key_dtypes( - self.lhs, self.rhs, _libcudf_to_output_castrules - ) - # first construct the index. if self.left_index and self.right_index: if self.how == "right": @@ -274,53 +271,44 @@ def construct_result(self, lhs, rhs, left_rows, right_rows): ), ) - return self.sort_result(result) + if self.sort: + result = self.sort_result(result) + return result def sort_result(self, result): - # If sort=True, Pandas sorts on the key columns in the + # Pandas sorts on the key columns in the # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. 
- if self.sort: - if self.on: - if isinstance(result, cudf.Index): - return result.sort_values() - else: - return result.sort_values( - _coerce_to_list(self.on), ignore_index=True - ) - by = [] - if self.left_index and self.right_index: - by.extend(result.index._data.columns) - if self.left_on: - by.extend( - [ - result._data[col] - for col in _coerce_to_list(self.left_on) - ] - ) - if self.right_on: - by.extend( - [ - result._data[col] - for col in _coerce_to_list(self.right_on) - ] + if self.on: + if isinstance(result, cudf.Index): + return result.sort_values() + else: + return result.sort_values( + _coerce_to_list(self.on), ignore_index=True ) - if by: - to_sort = cudf.DataFrame._from_columns(by) - sort_order = to_sort.argsort() - result = result.take(sort_order) + by = [] + if self.left_index and self.right_index: + by.extend(result.index._data.columns) + if self.left_on: + by.extend( + [result._data[col] for col in _coerce_to_list(self.left_on)] + ) + if self.right_on: + by.extend( + [result._data[col] for col in _coerce_to_list(self.right_on)] + ) + if by: + to_sort = cudf.DataFrame._from_columns(by) + sort_order = to_sort.argsort() + result = result.take(sort_order) return result def output_column_names(self): # Return mappings of input column names to (possibly) suffixed # result column names - left_names = OrderedDict( - zip(self.lhs._data.keys(), self.lhs._data.keys()) - ) - right_names = OrderedDict( - zip(self.rhs._data.keys(), self.rhs._data.keys()) - ) + left_names = OrderedDict(zip(self.lhs._data, self.lhs._data)) + right_names = OrderedDict(zip(self.rhs._data, self.rhs._data)) common_names = set(left_names) & set(right_names) if self.on: @@ -328,7 +316,7 @@ def output_column_names(self): else: key_columns_with_same_name = [] for lkey, rkey in zip(*self._keys): - if (lkey.index, rkey.index) == (False, False,): + if (lkey.index, rkey.index) == (False, False): if lkey.name == rkey.name: key_columns_with_same_name.append(lkey.name) for name in 
common_names: @@ -375,7 +363,7 @@ def validate_merge_params( raise ValueError("Can not merge on unnamed Series") # If nothing specified, must have common cols to use implicitly - same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys()) + same_named_columns = set(lhs._data) & set(rhs._data) if ( not (left_index or right_index) and not (left_on or right_on) @@ -414,6 +402,30 @@ def match_key_dtypes(self, lhs, rhs, match_func): right_key.set_value(out_rhs, rcol.astype(dtype)) return out_lhs, out_rhs + def restore_categorical_keys(self, lhs, rhs): + # For inner joins, any categorical keys were casted + # to the type of their categories. + # Here, we cast the keys back to categorical type + # before constructing the result + + out_lhs = lhs.copy(deep=False) + out_rhs = rhs.copy(deep=False) + + if self.how == "inner": + for left_key, right_key in zip(*self._keys): + if isinstance( + left_key.value(self.lhs).dtype, cudf.CategoricalDtype + ) and isinstance( + right_key.value(self.rhs).dtype, cudf.CategoricalDtype + ): + left_key.set_value( + out_lhs, left_key.value(out_lhs).astype("category") + ) + right_key.set_value( + out_rhs, right_key.value(out_rhs).astype("category") + ) + return out_lhs, out_rhs + class MergeSemi(Merge): def perform_merge(self): diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 367e903d02e..078619afcaf 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1147,10 +1147,7 @@ def test_typecast_on_join_overflow_unsafe(dtypes): with pytest.warns( UserWarning, - match=( - f"can't safely cast column" - f" from right with type {dtype_r} to {dtype_l}" - ), + match=(f"Can't safely cast column" f" from {dtype_r} to {dtype_l}"), ): merged = lhs.merge(rhs, on="a", how="left") # noqa: F841 From bd6bf771fcafd1968fced6fcd6c27251825d3a1d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 20 Feb 2021 10:29:45 -0500 Subject: [PATCH 054/138] Warnings --- 
python/cudf/cudf/tests/test_joining.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 078619afcaf..42c90998e7f 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -850,8 +850,8 @@ def test_join_empty_table_dtype(): "column_a", [ ( - pd.Series([None, 1, 2, 3, 4, 5, 6, 7]).astype(np.float), - pd.Series([8, 9, 10, 11, 12, None, 14, 15]).astype(np.float), + pd.Series([None, 1, 2, 3, 4, 5, 6, 7]).astype(np.float64), + pd.Series([8, 9, 10, 11, 12, None, 14, 15]).astype(np.float64), ) ], ) @@ -859,8 +859,8 @@ def test_join_empty_table_dtype(): "column_b", [ ( - pd.Series([0, 1, 0, None, 1, 0, 0, 0]).astype(np.float), - pd.Series([None, 1, 2, 1, 2, 2, 0, 0]).astype(np.float), + pd.Series([0, 1, 0, None, 1, 0, 0, 0]).astype(np.float64), + pd.Series([None, 1, 2, 1, 2, 2, 0, 0]).astype(np.float64), ) ], ) From a40063e70fb5339829833403472b202b5f6255e1 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Feb 2021 10:25:01 -0500 Subject: [PATCH 055/138] Cleanup --- python/cudf/cudf/core/join/join.py | 131 +++++++++++++---------------- 1 file changed, 59 insertions(+), 72 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 41265ba6af5..71a7b716775 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,5 +1,8 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+from __future__ import annotations + from collections import OrderedDict, namedtuple +from typing import TYPE_CHECKING import cudf from cudf import _lib as libcudf @@ -10,6 +13,9 @@ _Indexer, ) +if TYPE_CHECKING: + from cudf.core.frame import Frame + def merge( lhs, @@ -53,6 +59,7 @@ def merge( class Merge(object): JoinKeys = namedtuple("JoinKeys", ["left", "right"]) + _joiner = libcudf.join.join def __init__( self, @@ -218,7 +225,7 @@ def perform_merge(self): right_key_indices = [ key.get_numeric_index(rhs) for key in self._keys.right ] - left_rows, right_rows = libcudf.join.join( + left_rows, right_rows = self._joiner( lhs, rhs, left_on=left_key_indices, @@ -229,35 +236,60 @@ def perform_merge(self): return self.construct_result(lhs, rhs, left_rows, right_rows) def construct_result(self, lhs, rhs, left_rows, right_rows): - # first construct the index. + lhs = lhs._gather(left_rows, nullify=True) + rhs = rhs._gather(right_rows, nullify=True) + + result = self.merge_results(lhs, rhs) + + if self.sort: + result = self.sort_result(result) + return result + + def merge_results(self, lhs: Frame, rhs: Frame) -> Frame: + lnames = OrderedDict(zip(lhs._data, lhs._data)) + rnames = OrderedDict(zip(rhs._data, rhs._data)) + common_names = set(lnames) & set(rnames) + + if self.on: + key_columns_with_same_name = self.on + else: + key_columns_with_same_name = [] + for lkey, rkey in zip(*self._keys): + if (lkey.index, rkey.index) == (False, False): + if lkey.name == rkey.name: + key_columns_with_same_name.append(lkey.name) + + for name in common_names: + if name not in key_columns_with_same_name: + lnames[name] = f"{name}{self.lsuffix}" + rnames[name] = f"{name}{self.rsuffix}" + else: + del rnames[name] + + # now construct the data: + data = cudf.core.column_accessor.ColumnAccessor() + + for lcol in lnames: + data[lnames[lcol]] = lhs._data[lcol] + for rcol in rnames: + data[rnames[rcol]] = rhs._data[rcol] + + # drop the index we won't be using: if self.left_index and 
self.right_index: if self.how == "right": - out_index = rhs.index._gather(left_rows, nullify=True) + index = rhs._index else: - out_index = lhs.index._gather(left_rows, nullify=True) + index = lhs._index elif self.left_index: # left_index and right_on - out_index = rhs.index._gather(right_rows, nullify=True) + index = rhs._index elif self.right_index: # right_index and left_on - out_index = lhs.index._gather(left_rows, nullify=True) + index = lhs._index else: - out_index = None - - # now construct the data: - data = cudf.core.column_accessor.ColumnAccessor() - left_names, right_names = self.output_column_names() - - for lcol in left_names: - data[left_names[lcol]] = lhs._data[lcol].take( - left_rows, nullify=True - ) - for rcol in right_names: - data[right_names[rcol]] = rhs._data[rcol].take( - right_rows, nullify=True - ) + index = None - result = self.out_class._from_data(data, index=out_index) + result = self.out_class._from_data(data=data, index=index) # if outer join, key columns with the same name are combined: if self.how == "outer": @@ -265,14 +297,9 @@ def construct_result(self, lhs, rhs, left_rows, right_rows): if lkey.name == rkey.name: # fill nulls in the key column with values from the RHS lkey.set_value( - result, - lkey.value(result).fillna( - rkey.value(rhs).take(right_rows, nullify=True) - ), + result, lkey.value(result).fillna(rkey.value(rhs)), ) - if self.sort: - result = self.sort_result(result) return result def sort_result(self, result): @@ -304,29 +331,6 @@ def sort_result(self, result): result = result.take(sort_order) return result - def output_column_names(self): - # Return mappings of input column names to (possibly) suffixed - # result column names - left_names = OrderedDict(zip(self.lhs._data, self.lhs._data)) - right_names = OrderedDict(zip(self.rhs._data, self.rhs._data)) - common_names = set(left_names) & set(right_names) - - if self.on: - key_columns_with_same_name = self.on - else: - key_columns_with_same_name = [] - for lkey, rkey 
in zip(*self._keys): - if (lkey.index, rkey.index) == (False, False): - if lkey.name == rkey.name: - key_columns_with_same_name.append(lkey.name) - for name in common_names: - if name not in key_columns_with_same_name: - left_names[name] = f"{name}{self.lsuffix}" - right_names[name] = f"{name}{self.rsuffix}" - else: - del right_names[name] - return left_names, right_names - @staticmethod def validate_merge_params( lhs, @@ -428,26 +432,9 @@ def restore_categorical_keys(self, lhs, rhs): class MergeSemi(Merge): - def perform_merge(self): - lhs, rhs = self.match_key_dtypes(self.lhs, self.rhs, _cast_join_keys) - - left_key_indices = [ - key.get_numeric_index(lhs) for key in self._keys.left - ] - right_key_indices = [ - key.get_numeric_index(rhs) for key in self._keys.right - ] - left_rows = libcudf.join.semi_join( - lhs, - rhs, - left_on=left_key_indices, - right_on=right_key_indices, - how=self.how, - ) - return self.construct_result( - lhs, rhs, left_rows, cudf.core.column.as_column([]) - ) + def _joiner(self, lhs, rhs, left_on, right_on, how): + left_rows = libcudf.join.semi_join(lhs, rhs, left_on, right_on, how) + return left_rows, cudf.core.column.as_column([], dtype="int32") - def output_column_names(self): - left_names, _ = super().output_column_names() - return left_names, {} + def merge_results(self, lhs, rhs): + return super().merge_results(lhs, cudf.core.frame.Frame()) From ccef9d09998959d85c99f6afa1d18cef9bd20883 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Feb 2021 11:27:13 -0500 Subject: [PATCH 056/138] Cleanup --- python/cudf/cudf/core/join/_join_helpers.py | 12 +-- python/cudf/cudf/core/join/join.py | 107 ++++++++++---------- 2 files changed, 59 insertions(+), 60 deletions(-) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 2b4c655f057..4d5bc1dd1c0 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -49,7 +49,7 @@ def 
get_numeric_index(self, obj): return obj.index.names.index(self.name) -def _cast_join_keys(lcol, rcol, how): +def _match_join_keys(lcol, rcol, how): # cast the keys lcol and rcol to a common dtype ltype = lcol.dtype @@ -59,7 +59,7 @@ def _cast_join_keys(lcol, rcol, how): if isinstance(ltype, CategoricalDtype) or isinstance( rtype, CategoricalDtype ): - return _cast_join_categorical_keys(lcol, rcol, how) + return _match_join_categorical_keys(lcol, rcol, how) if pd.api.types.is_dtype_equal(ltype, rtype): return ltype @@ -91,7 +91,7 @@ def _cast_join_keys(lcol, rcol, how): return None -def _cast_join_categorical_keys(lcol, rcol, how): +def _match_join_categorical_keys(lcol, rcol, how): # cast the keys lcol and rcol to a common dtype # when at least one of them is a categorical type @@ -100,7 +100,7 @@ def _cast_join_categorical_keys(lcol, rcol, how): if l_is_cat and r_is_cat: # if both are categoricals, logic is complicated: - return _cast_join_categorical_keys_both(lcol, rcol, how) + return _match_join_categorical_keys_both(lcol, rcol, how) elif l_is_cat or r_is_cat: if l_is_cat and how in {"left", "leftsemi", "leftanti"}: return lcol.dtype @@ -114,7 +114,7 @@ def _cast_join_categorical_keys(lcol, rcol, how): raise ValueError("Neither operand is categorical") -def _cast_join_categorical_keys_both(lcol, rcol, how): +def _match_join_categorical_keys_both(lcol, rcol, how): # cast lcol and rcol to a common type when they are *both* # categorical types. 
# @@ -151,7 +151,7 @@ def _cast_join_categorical_keys_both(lcol, rcol, how): if how == "inner": # cast to category types -- we must cast them back later - return _cast_join_keys(ltype.categories, rtype.categories, how) + return _match_join_keys(ltype.categories, rtype.categories, how) elif how in {"left", "leftanti", "leftsemi"}: # always cast to left type return ltype diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 71a7b716775..d2505016f2c 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -7,10 +7,10 @@ import cudf from cudf import _lib as libcudf from cudf.core.join._join_helpers import ( - _cast_join_keys, _coerce_to_list, _coerce_to_tuple, _Indexer, + _match_join_keys, ) if TYPE_CHECKING: @@ -121,7 +121,7 @@ def __init__( Left and right suffixes specified together, unpacked into lsuffix and rsuffix. """ - self.validate_merge_params( + self._validate_merge_params( lhs, rhs, on=on, @@ -157,9 +157,37 @@ def __init__( elif isinstance(self.lhs, cudf.Index): self.out_class = self.lhs.__class__ - self.compute_join_keys() + self._compute_join_keys() - def compute_join_keys(self): + def perform_merge(self): + lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs) + + left_key_indices = [ + key.get_numeric_index(lhs) for key in self._keys.left + ] + right_key_indices = [ + key.get_numeric_index(rhs) for key in self._keys.right + ] + + left_rows, right_rows = self._joiner( + lhs, + rhs, + left_on=left_key_indices, + right_on=right_key_indices, + how=self.how, + ) + lhs, rhs = self._restore_categorical_keys(lhs, rhs) + + left_result = lhs._gather(left_rows, nullify=True) + right_result = rhs._gather(right_rows, nullify=True) + + result = self._merge_results(left_result, right_result) + + if self.sort: + result = self._sort_result(result) + return result + + def _compute_join_keys(self): if ( self.left_index or self.right_index @@ -216,38 +244,11 @@ def compute_join_keys(self): self._keys = 
self.__class__.JoinKeys(left=left_keys, right=right_keys) - def perform_merge(self): - lhs, rhs = self.match_key_dtypes(self.lhs, self.rhs, _cast_join_keys) - - left_key_indices = [ - key.get_numeric_index(lhs) for key in self._keys.left - ] - right_key_indices = [ - key.get_numeric_index(rhs) for key in self._keys.right - ] - left_rows, right_rows = self._joiner( - lhs, - rhs, - left_on=left_key_indices, - right_on=right_key_indices, - how=self.how, - ) - lhs, rhs = self.restore_categorical_keys(lhs, rhs) - return self.construct_result(lhs, rhs, left_rows, right_rows) - - def construct_result(self, lhs, rhs, left_rows, right_rows): - lhs = lhs._gather(left_rows, nullify=True) - rhs = rhs._gather(right_rows, nullify=True) - - result = self.merge_results(lhs, rhs) - - if self.sort: - result = self.sort_result(result) - return result + def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: + # merge the left result and right result into a single Frame - def merge_results(self, lhs: Frame, rhs: Frame) -> Frame: - lnames = OrderedDict(zip(lhs._data, lhs._data)) - rnames = OrderedDict(zip(rhs._data, rhs._data)) + lnames = OrderedDict(zip(left_result._data, left_result._data)) + rnames = OrderedDict(zip(right_result._data, right_result._data)) common_names = set(lnames) & set(rnames) if self.on: @@ -270,22 +271,22 @@ def merge_results(self, lhs: Frame, rhs: Frame) -> Frame: data = cudf.core.column_accessor.ColumnAccessor() for lcol in lnames: - data[lnames[lcol]] = lhs._data[lcol] + data[lnames[lcol]] = left_result._data[lcol] for rcol in rnames: - data[rnames[rcol]] = rhs._data[rcol] + data[rnames[rcol]] = right_result._data[rcol] # drop the index we won't be using: if self.left_index and self.right_index: if self.how == "right": - index = rhs._index + index = right_result._index else: - index = lhs._index + index = left_result._index elif self.left_index: # left_index and right_on - index = rhs._index + index = right_result._index elif 
self.right_index: # right_index and left_on - index = lhs._index + index = left_result._index else: index = None @@ -297,12 +298,13 @@ def merge_results(self, lhs: Frame, rhs: Frame) -> Frame: if lkey.name == rkey.name: # fill nulls in the key column with values from the RHS lkey.set_value( - result, lkey.value(result).fillna(rkey.value(rhs)), + result, + lkey.value(result).fillna(rkey.value(right_result)), ) return result - def sort_result(self, result): + def _sort_result(self, result): # Pandas sorts on the key columns in the # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, @@ -332,7 +334,7 @@ def sort_result(self, result): return result @staticmethod - def validate_merge_params( + def _validate_merge_params( lhs, rhs, on, @@ -392,25 +394,22 @@ def validate_merge_params( "lsuffix and rsuffix are not defined" ) - def match_key_dtypes(self, lhs, rhs, match_func): + def _match_key_dtypes(self, lhs, rhs): + # Match the dtypes of the key columns from lhs and rhs out_lhs = lhs.copy(deep=False) out_rhs = rhs.copy(deep=False) - # match the dtypes of the key columns in - # self.lhs and self.rhs according to the matching - # function `match_func` for left_key, right_key in zip(*self._keys): lcol, rcol = left_key.value(lhs), right_key.value(rhs) - dtype = match_func(lcol, rcol, how=self.how) + dtype = _match_join_keys(lcol, rcol, how=self.how) if dtype: left_key.set_value(out_lhs, lcol.astype(dtype)) right_key.set_value(out_rhs, rcol.astype(dtype)) return out_lhs, out_rhs - def restore_categorical_keys(self, lhs, rhs): + def _restore_categorical_keys(self, lhs, rhs): # For inner joins, any categorical keys were casted # to the type of their categories. - # Here, we cast the keys back to categorical type - # before constructing the result + # Here, we cast the keys back to categorical type. 
out_lhs = lhs.copy(deep=False) out_rhs = rhs.copy(deep=False) @@ -436,5 +435,5 @@ def _joiner(self, lhs, rhs, left_on, right_on, how): left_rows = libcudf.join.semi_join(lhs, rhs, left_on, right_on, how) return left_rows, cudf.core.column.as_column([], dtype="int32") - def merge_results(self, lhs, rhs): - return super().merge_results(lhs, cudf.core.frame.Frame()) + def _merge_results(self, lhs, rhs): + return super()._merge_results(lhs, cudf.core.frame.Frame()) From 210244b89ec30a0bb3f86f3a7c079262364cc2c9 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Feb 2021 12:10:07 -0500 Subject: [PATCH 057/138] Cleanup --- python/cudf/cudf/_lib/join.pyx | 4 +- python/cudf/cudf/core/join/_join_helpers.py | 8 +- python/cudf/cudf/core/join/join.py | 119 ++++++++++++-------- 3 files changed, 79 insertions(+), 52 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 20c14f00957..0339d86384d 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -51,6 +51,8 @@ cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): cpdef semi_join(Table lhs, Table rhs, left_on, right_on, how=None): + from cudf.core.column import as_column + # left-semi and left-anti joins cdef vector[size_type] c_left_on = left_on cdef vector[size_type] c_right_on = right_on @@ -70,4 +72,4 @@ cpdef semi_join(Table lhs, Table rhs, left_on, right_on, how=None): )) else: raise ValueError(f"Invalid join type {how}") - return Column.from_unique_ptr(move(c_result)) + return Column.from_unique_ptr(move(c_result)), as_column([], dtype="int32") diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 4d5bc1dd1c0..f7a8622c80a 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -18,21 +18,21 @@ class _Indexer: # 4 1 # 5 2 # 6 3 - # >>> _Indexer("a", column=True).value(df) # returns column "a" of df - # >>> _Indexer("b", 
index=True).value(df) # returns index level "b" of df + # >>> _Indexer("a", column=True).get(df) # returns column "a" of df + # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df def __init__(self, name, column=False, index=False): self.name = name self.column, self.index = column, index - def value(self, obj): + def get(self, obj): # get the column from `obj` if self.column: return obj._data[self.name] else: return obj._index._data[self.name] - def set_value(self, obj, value): + def set(self, obj, value): # set the colum in `obj` if self.column: obj._data[self.name] = value diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index d2505016f2c..0d53184ced5 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -57,8 +57,26 @@ def merge( return mergeobj.perform_merge() +_JoinKeys = namedtuple("JoinKeys", ["left", "right"]) + + class Merge(object): - JoinKeys = namedtuple("JoinKeys", ["left", "right"]) + # A namedtuple of indexers representing the left and right keys + _keys: _JoinKeys + + # The joiner function must have the following signature: + # + # def joiner(lhs, rhs, left_on, right_on, how=how): + # ... + # + # Where: + # + # - `lhs` and `rhs` represent the left and right Frames to join + # - `left_on` and `right_on` represent the *numerical* indices + # of the key columns of lhs and rhs. This allows specifying + # index levels as keys in an unambiguous way. + # - `how` is a string specifying the kind of join to perform + # (useful if the joiner function can perform more than one join). 
_joiner = libcudf.join.join def __init__( @@ -148,14 +166,14 @@ def __init__( self.rsuffix = rsuffix self.suffixes = suffixes - self.out_class = cudf.DataFrame + self._out_class = cudf.DataFrame if isinstance(self.lhs, cudf.MultiIndex) or isinstance( self.rhs, cudf.MultiIndex ): - self.out_class = cudf.MultiIndex + self._out_class = cudf.MultiIndex elif isinstance(self.lhs, cudf.Index): - self.out_class = self.lhs.__class__ + self._out_class = self.lhs.__class__ self._compute_join_keys() @@ -188,6 +206,7 @@ def perform_merge(self): return result def _compute_join_keys(self): + # Computes self._keys if ( self.left_index or self.right_index @@ -242,14 +261,31 @@ def _compute_join_keys(self): "Merge operands must have same number of join key columns" ) - self._keys = self.__class__.JoinKeys(left=left_keys, right=right_keys) + self._keys = _JoinKeys(left=left_keys, right=right_keys) def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: - # merge the left result and right result into a single Frame + # Merge the Frames `left_result` and `right_result` into a single + # `Frame`, suffixing column names if necessary. + + # For outer joins, the key columns from left_result and + # right_result are combined if they have the same name. + # We will drop those keys from right_result later, so + # combine them now with keys from left_result. 
+ if self.how == "outer": + for lkey, rkey in zip(*self._keys): + if lkey.name == rkey.name: + # fill nulls in lhs from values in the rhs + lkey.set( + left_result, + lkey.get(left_result).fillna(rkey.get(right_result)), + ) + + # `left_names` and `right_names` are mappings of column names + # of `lhs` and `rhs` to the corresponding column names in the result + left_names = OrderedDict(zip(left_result._data, left_result._data)) + right_names = OrderedDict(zip(right_result._data, right_result._data)) - lnames = OrderedDict(zip(left_result._data, left_result._data)) - rnames = OrderedDict(zip(right_result._data, right_result._data)) - common_names = set(lnames) & set(rnames) + common_names = set(left_names) & set(right_names) if self.on: key_columns_with_same_name = self.on @@ -260,22 +296,25 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: if lkey.name == rkey.name: key_columns_with_same_name.append(lkey.name) + # For any columns with the same name: + # - if they are key columns, keep only the left column + # - if they are not key columns, use suffixes for name in common_names: if name not in key_columns_with_same_name: - lnames[name] = f"{name}{self.lsuffix}" - rnames[name] = f"{name}{self.rsuffix}" + left_names[name] = f"{name}{self.lsuffix}" + right_names[name] = f"{name}{self.rsuffix}" else: - del rnames[name] + del right_names[name] - # now construct the data: + # Assemble the data columns of the result: data = cudf.core.column_accessor.ColumnAccessor() - for lcol in lnames: - data[lnames[lcol]] = left_result._data[lcol] - for rcol in rnames: - data[rnames[rcol]] = right_result._data[rcol] + for lcol in left_names: + data[left_names[lcol]] = left_result._data[lcol] + for rcol in right_names: + data[right_names[rcol]] = right_result._data[rcol] - # drop the index we won't be using: + # Index of the result: if self.left_index and self.right_index: if self.how == "right": index = right_result._index @@ -290,17 +329,8 @@ def 
_merge_results(self, left_result: Frame, right_result: Frame) -> Frame: else: index = None - result = self.out_class._from_data(data=data, index=index) - - # if outer join, key columns with the same name are combined: - if self.how == "outer": - for lkey, rkey in zip(*self._keys): - if lkey.name == rkey.name: - # fill nulls in the key column with values from the RHS - lkey.set_value( - result, - lkey.value(result).fillna(rkey.value(right_result)), - ) + # Construct result from data and index: + result = self._out_class._from_data(data=data, index=index) return result @@ -350,7 +380,6 @@ def _validate_merge_params( """ Error for various invalid combinations of merge input parameters """ - # must actually support the requested merge type if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}: raise NotImplementedError(f"{how} merge not supported yet") @@ -399,41 +428,37 @@ def _match_key_dtypes(self, lhs, rhs): out_lhs = lhs.copy(deep=False) out_rhs = rhs.copy(deep=False) for left_key, right_key in zip(*self._keys): - lcol, rcol = left_key.value(lhs), right_key.value(rhs) + lcol, rcol = left_key.get(lhs), right_key.get(rhs) dtype = _match_join_keys(lcol, rcol, how=self.how) if dtype: - left_key.set_value(out_lhs, lcol.astype(dtype)) - right_key.set_value(out_rhs, rcol.astype(dtype)) + left_key.set(out_lhs, lcol.astype(dtype)) + right_key.set(out_rhs, rcol.astype(dtype)) return out_lhs, out_rhs def _restore_categorical_keys(self, lhs, rhs): - # For inner joins, any categorical keys were casted - # to the type of their categories. - # Here, we cast the keys back to categorical type. - + # For inner joins, any categorical keys in `self.lhs` and `self.rhs` + # were casted to their category type to produce `lhs` and `rhs`. + # Here, we cast them back. 
out_lhs = lhs.copy(deep=False) out_rhs = rhs.copy(deep=False) - if self.how == "inner": for left_key, right_key in zip(*self._keys): if isinstance( - left_key.value(self.lhs).dtype, cudf.CategoricalDtype + left_key.get(self.lhs).dtype, cudf.CategoricalDtype ) and isinstance( - right_key.value(self.rhs).dtype, cudf.CategoricalDtype + right_key.get(self.rhs).dtype, cudf.CategoricalDtype ): - left_key.set_value( - out_lhs, left_key.value(out_lhs).astype("category") + left_key.set( + out_lhs, left_key.get(out_lhs).astype("category") ) - right_key.set_value( - out_rhs, right_key.value(out_rhs).astype("category") + right_key.set( + out_rhs, right_key.get(out_rhs).astype("category") ) return out_lhs, out_rhs class MergeSemi(Merge): - def _joiner(self, lhs, rhs, left_on, right_on, how): - left_rows = libcudf.join.semi_join(lhs, rhs, left_on, right_on, how) - return left_rows, cudf.core.column.as_column([], dtype="int32") + _joiner = libcudf.join.semi_join def _merge_results(self, lhs, rhs): return super()._merge_results(lhs, cudf.core.frame.Frame()) From b57348c88543a01a2ae618d375874924d7b07897 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Feb 2021 12:45:19 -0500 Subject: [PATCH 058/138] Add typing for join helpers --- python/cudf/cudf/core/column/column.py | 3 + python/cudf/cudf/core/index.py | 4 ++ python/cudf/cudf/core/join/_join_helpers.py | 75 +++++++++++---------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0f99395d919..8e8587b2dee 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -991,6 +991,9 @@ def distinct_count( raise NotImplementedError(msg) return cpp_distinct_count(self, ignore_nulls=dropna) + def can_cast_safely(self, to_dtype: Dtype) -> bool: + return False + def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, 
**kwargs) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 78d43d0275b..cfb0dc4238c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -12,6 +12,7 @@ from pandas._config import get_option import cudf +from cudf._typing import DtypeObj from cudf.core.abc import Serializable from cudf.core.column import ( CategoricalColumn, @@ -65,6 +66,9 @@ def _to_frame(this_index, index=True, name=None): class Index(Frame, Serializable): + + dtype: DtypeObj + def __new__( cls, data=None, diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index f7a8622c80a..d4b25951de2 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -1,6 +1,8 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +from __future__ import annotations import warnings +from typing import TYPE_CHECKING, Any import numpy as np import pandas as pd @@ -8,6 +10,11 @@ import cudf from cudf.core.dtypes import CategoricalDtype +if TYPE_CHECKING: + from cudf._typing import Dtype + from cudf.core.column import ColumnBase + from cudf.core.frame import Frame + class _Indexer: # Indexer into a column (either a data column or index level). 
@@ -21,35 +28,38 @@ class _Indexer: # >>> _Indexer("a", column=True).get(df) # returns column "a" of df # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df - def __init__(self, name, column=False, index=False): + def __init__(self, name: Any, column=False, index=False): self.name = name self.column, self.index = column, index - def get(self, obj): + def get(self, obj: Frame) -> ColumnBase: # get the column from `obj` if self.column: return obj._data[self.name] - else: + if obj._index is not None: return obj._index._data[self.name] + raise KeyError() - def set(self, obj, value): + def set(self, obj: Frame, value: ColumnBase): # set the colum in `obj` if self.column: obj._data[self.name] = value - else: + if obj._index is not None: obj._index._data[self.name] = value + raise KeyError() - def get_numeric_index(self, obj): + def get_numeric_index(self, obj: Frame) -> int: # get the position of the column in `obj` # (counting any index columns) if self.column: - index_nlevels = obj.index.nlevels if obj._index is not None else 0 + index_nlevels = obj._index.nlevels if obj._index is not None else 0 return index_nlevels + tuple(obj._data).index(self.name) - else: - return obj.index.names.index(self.name) + if obj._index is not None: + return obj._index.names.index(self.name) + raise KeyError() -def _match_join_keys(lcol, rcol, how): +def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: # cast the keys lcol and rcol to a common dtype ltype = lcol.dtype @@ -59,7 +69,7 @@ def _match_join_keys(lcol, rcol, how): if isinstance(ltype, CategoricalDtype) or isinstance( rtype, CategoricalDtype ): - return _match_join_categorical_keys(lcol, rcol, how) + return _match_categorical_dtypes(ltype, rtype, how) if pd.api.types.is_dtype_equal(ltype, rtype): return ltype @@ -91,38 +101,31 @@ def _match_join_keys(lcol, rcol, how): return None -def _match_join_categorical_keys(lcol, rcol, how): +def _match_categorical_dtypes(ltype: Dtype, rtype: 
Dtype, how: str) -> Dtype: # cast the keys lcol and rcol to a common dtype # when at least one of them is a categorical type - l_is_cat = isinstance(lcol.dtype, CategoricalDtype) - r_is_cat = isinstance(rcol.dtype, CategoricalDtype) - - if l_is_cat and r_is_cat: + if isinstance(ltype, CategoricalDtype) and isinstance( + rtype, CategoricalDtype + ): # if both are categoricals, logic is complicated: - return _match_join_categorical_keys_both(lcol, rcol, how) - elif l_is_cat or r_is_cat: - if l_is_cat and how in {"left", "leftsemi", "leftanti"}: - return lcol.dtype - common_type = ( - lcol.dtype.categories.dtype - if l_is_cat - else rcol.dtype.categories.dtype - ) - return common_type - else: - raise ValueError("Neither operand is categorical") + return _match_categorical_dtypes_both(ltype, rtype, how) + + if isinstance(ltype, CategoricalDtype): + if how in {"left", "leftsemi", "leftanti"}: + return ltype + common_type = ltype.categories.dtype + elif isinstance(rtype, CategoricalDtype): + common_type = rtype.categories.dtype + return common_type -def _match_join_categorical_keys_both(lcol, rcol, how): - # cast lcol and rcol to a common type when they are *both* - # categorical types. - # +def _match_categorical_dtypes_both( + ltype: CategoricalDtype, rtype: CategoricalDtype, how: str +) -> Dtype: # The commontype depends on both `how` and the specifics of the # categorical variables to be merged. 
- ltype, rtype = lcol.dtype, rcol.dtype - # when both are ordered and both have the same categories, # no casting required: if ltype == rtype: @@ -151,7 +154,9 @@ def _match_join_categorical_keys_both(lcol, rcol, how): if how == "inner": # cast to category types -- we must cast them back later - return _match_join_keys(ltype.categories, rtype.categories, how) + return _match_join_keys( + ltype.categories._values, rtype.categories._values, how + ) elif how in {"left", "leftanti", "leftsemi"}: # always cast to left type return ltype From 5c2c9b368d713653b1d25fad198cc2bf65e98c0a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Feb 2021 13:52:18 -0500 Subject: [PATCH 059/138] Typing for Join class --- python/cudf/cudf/core/column/column.py | 4 +++- python/cudf/cudf/core/column/numerical.py | 4 +++- python/cudf/cudf/core/frame.py | 17 ++++++++++---- python/cudf/cudf/core/join/_join_helpers.py | 20 +++++++++------- python/cudf/cudf/core/join/join.py | 26 +++++++++++---------- 5 files changed, 45 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8e8587b2dee..65e2a142992 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -979,7 +979,9 @@ def sort_by_values( ascending: bool = True, na_position: builtins.str = "last", ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]: - col_inds = self.as_frame()._get_sorted_inds(ascending, na_position) + col_inds = self.as_frame()._get_sorted_inds( + ascending=ascending, na_position=na_position + ) col_keys = self.take(col_inds) return col_keys, col_inds diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0a8d93c913b..3add003efc1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -333,7 +333,9 @@ def _numeric_quantile( ) -> NumericalColumn: quant = [float(q)] if not isinstance(q, 
(Sequence, np.ndarray)) else q # get sorted indices and exclude nulls - sorted_indices = self.as_frame()._get_sorted_inds(True, "first") + sorted_indices = self.as_frame()._get_sorted_inds( + ascending=True, na_position="first" + ) sorted_indices = sorted_indices[self.null_count :] return cpp_quantile(self, quant, interpolation, sorted_indices, exact) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1dfb65ace38..ce554a6c3b9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -569,7 +569,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) - def _get_columns_by_label(self, labels, downcast): + def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -2720,12 +2720,15 @@ def searchsorted( else: return result - def _get_sorted_inds(self, ascending=True, na_position="last"): + def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): """ Sort by the values. Parameters ---------- + by: list, optional + Labels specifying columns to sort by. By default, + sort by all columns of `self` ascending : bool or list of bool, default True If True, sort values in ascending order, otherwise descending. 
na_position : {‘first’ or ‘last’}, default ‘last’ @@ -2760,11 +2763,17 @@ def _get_sorted_inds(self, ascending=True, na_position="last"): ) na_position = 0 + to_sort = ( + self + if by is None + else self._get_columns_by_label(by, downcast=False) + ) + # If given a scalar need to construct a sequence of length # of columns if np.isscalar(ascending): - ascending = [ascending] * self._num_columns + ascending = [ascending] * to_sort._num_columns - return libcudf.sort.order_by(self, ascending, na_position) + return libcudf.sort.order_by(to_sort, ascending, na_position) def sin(self): """ diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index d4b25951de2..a499d3d33de 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -36,17 +36,20 @@ def get(self, obj: Frame) -> ColumnBase: # get the column from `obj` if self.column: return obj._data[self.name] - if obj._index is not None: - return obj._index._data[self.name] + else: + if obj._index is not None: + return obj._index._data[self.name] raise KeyError() def set(self, obj: Frame, value: ColumnBase): # set the colum in `obj` if self.column: obj._data[self.name] = value - if obj._index is not None: - obj._index._data[self.name] = value - raise KeyError() + else: + if obj._index is not None: + obj._index._data[self.name] = value + else: + raise KeyError() def get_numeric_index(self, obj: Frame) -> int: # get the position of the column in `obj` @@ -54,9 +57,10 @@ def get_numeric_index(self, obj: Frame) -> int: if self.column: index_nlevels = obj._index.nlevels if obj._index is not None else 0 return index_nlevels + tuple(obj._data).index(self.name) - if obj._index is not None: - return obj._index.names.index(self.name) - raise KeyError() + else: + if obj._index is not None: + return obj._index.names.index(self.name) + raise KeyError() def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: diff 
--git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 0d53184ced5..61fe6bfc082 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,7 +2,7 @@ from __future__ import annotations from collections import OrderedDict, namedtuple -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Tuple import cudf from cudf import _lib as libcudf @@ -177,7 +177,7 @@ def __init__( self._compute_join_keys() - def perform_merge(self): + def perform_merge(self) -> Frame: lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs) left_key_indices = [ @@ -334,21 +334,21 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: return result - def _sort_result(self, result): + def _sort_result(self, result: Frame) -> Frame: # Pandas sorts on the key columns in the # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. 
if self.on: if isinstance(result, cudf.Index): - return result.sort_values() + sort_order = result._get_sorted_inds() else: - return result.sort_values( - _coerce_to_list(self.on), ignore_index=True - ) + sort_order = result._get_sorted_inds(_coerce_to_list(self.on)) + return result._gather(sort_order, keep_index=False) by = [] if self.left_index and self.right_index: - by.extend(result.index._data.columns) + if result._index is not None: + by.extend(result._index._data.columns) if self.left_on: by.extend( [result._data[col] for col in _coerce_to_list(self.left_on)] @@ -360,7 +360,7 @@ def _sort_result(self, result): if by: to_sort = cudf.DataFrame._from_columns(by) sort_order = to_sort.argsort() - result = result.take(sort_order) + result = result._gather(sort_order) return result @staticmethod @@ -423,7 +423,7 @@ def _validate_merge_params( "lsuffix and rsuffix are not defined" ) - def _match_key_dtypes(self, lhs, rhs): + def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]: # Match the dtypes of the key columns from lhs and rhs out_lhs = lhs.copy(deep=False) out_rhs = rhs.copy(deep=False) @@ -435,7 +435,9 @@ def _match_key_dtypes(self, lhs, rhs): right_key.set(out_rhs, rcol.astype(dtype)) return out_lhs, out_rhs - def _restore_categorical_keys(self, lhs, rhs): + def _restore_categorical_keys( + self, lhs: Frame, rhs: Frame + ) -> Tuple[Frame, Frame]: # For inner joins, any categorical keys in `self.lhs` and `self.rhs` # were casted to their category type to produce `lhs` and `rhs`. # Here, we cast them back. 
@@ -460,5 +462,5 @@ def _restore_categorical_keys(self, lhs, rhs): class MergeSemi(Merge): _joiner = libcudf.join.semi_join - def _merge_results(self, lhs, rhs): + def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: return super()._merge_results(lhs, cudf.core.frame.Frame()) From 558aa15b7a3eba4a302e5648b9057c9c39e4455f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Feb 2021 15:41:59 -0500 Subject: [PATCH 060/138] Simplify joiner API --- python/cudf/cudf/_lib/join.pyx | 16 +++---- python/cudf/cudf/core/join/_join_helpers.py | 28 +++++++---- python/cudf/cudf/core/join/join.py | 52 +++++++++------------ 3 files changed, 46 insertions(+), 50 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 0339d86384d..f1677e3f856 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -19,13 +19,11 @@ from cudf._lib.cpp.table.table_view cimport table_view cimport cudf._lib.cpp.join as cpp_join -cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): +cpdef join(Table lhs, Table rhs, how=None): # left, inner and outer join - cdef vector[size_type] c_left_on = left_on - cdef vector[size_type] c_right_on = right_on cdef pair[unique_ptr[column], unique_ptr[column]] c_result - cdef table_view c_lhs = lhs.view().select(c_left_on) - cdef table_view c_rhs = rhs.view().select(c_right_on) + cdef table_view c_lhs = lhs.view() + cdef table_view c_rhs = rhs.view() if how == "inner": c_result = move(cpp_join.inner_join( @@ -50,15 +48,13 @@ cpdef join(Table lhs, Table rhs, left_on, right_on, how=None): ) -cpdef semi_join(Table lhs, Table rhs, left_on, right_on, how=None): +cpdef semi_join(Table lhs, Table rhs, how=None): from cudf.core.column import as_column # left-semi and left-anti joins - cdef vector[size_type] c_left_on = left_on - cdef vector[size_type] c_right_on = right_on cdef unique_ptr[column] c_result - cdef table_view c_lhs = lhs.view().select(c_left_on) - cdef table_view c_rhs = 
rhs.view().select(c_right_on) + cdef table_view c_lhs = lhs.view() + cdef table_view c_rhs = rhs.view() if how == "leftsemi": c_result = move(cpp_join.left_semi_join( diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index a499d3d33de..02a1a49564a 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Iterable import numpy as np import pandas as pd @@ -51,16 +51,24 @@ def set(self, obj: Frame, value: ColumnBase): else: raise KeyError() - def get_numeric_index(self, obj: Frame) -> int: - # get the position of the column in `obj` - # (counting any index columns) - if self.column: - index_nlevels = obj._index.nlevels if obj._index is not None else 0 - return index_nlevels + tuple(obj._data).index(self.name) + +def _frame_select_by_indexers( + frame: Frame, indexers: Iterable[_Indexer] +) -> Frame: + # Select columns from the given `Frame` using `indexers`, + # and return a new `Frame`. 
+ index_data = frame._data.__class__() + data = frame._data.__class__() + + for idx in indexers: + if idx.index: + index_data[idx.name] = idx.get(frame) else: - if obj._index is not None: - return obj._index.names.index(self.name) - raise KeyError() + data[idx.name] = idx.get(frame) + + result_index = cudf.Index._from_data(index_data) if index_data else None + result = cudf.core.frame.Frame(data=data, index=result_index) + return result def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 61fe6bfc082..1377ecf5df8 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,13 +2,14 @@ from __future__ import annotations from collections import OrderedDict, namedtuple -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Callable, Tuple import cudf from cudf import _lib as libcudf from cudf.core.join._join_helpers import ( _coerce_to_list, _coerce_to_tuple, + _frame_select_by_indexers, _Indexer, _match_join_keys, ) @@ -66,18 +67,17 @@ class Merge(object): # The joiner function must have the following signature: # - # def joiner(lhs, rhs, left_on, right_on, how=how): + # def joiner(lhs, rhs, how=how): # ... # - # Where: + # where: # - # - `lhs` and `rhs` represent the left and right Frames to join - # - `left_on` and `right_on` represent the *numerical* indices - # of the key columns of lhs and rhs. This allows specifying - # index levels as keys in an unambiguous way. + # - `lhs` and `rhs` are Frames composed of the left and right join keys # - `how` is a string specifying the kind of join to perform - # (useful if the joiner function can perform more than one join). - _joiner = libcudf.join.join + # + # ...and it returns a tuple of two gather maps representing the rows + # to gather from the left- and right- side tables respectively. 
+ _joiner: Callable = libcudf.join.join def __init__( self, @@ -166,33 +166,28 @@ def __init__( self.rsuffix = rsuffix self.suffixes = suffixes - self._out_class = cudf.DataFrame + self._compute_join_keys() + + @property + def _out_class(self): + out_class = cudf.DataFrame if isinstance(self.lhs, cudf.MultiIndex) or isinstance( self.rhs, cudf.MultiIndex ): - self._out_class = cudf.MultiIndex + out_class = cudf.MultiIndex elif isinstance(self.lhs, cudf.Index): - self._out_class = self.lhs.__class__ - - self._compute_join_keys() + out_class = self.lhs.__class__ + return out_class def perform_merge(self) -> Frame: lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs) - left_key_indices = [ - key.get_numeric_index(lhs) for key in self._keys.left - ] - right_key_indices = [ - key.get_numeric_index(rhs) for key in self._keys.right - ] + left_table = _frame_select_by_indexers(lhs, self._keys.left) + right_table = _frame_select_by_indexers(rhs, self._keys.right) left_rows, right_rows = self._joiner( - lhs, - rhs, - left_on=left_key_indices, - right_on=right_key_indices, - how=self.how, + left_table, right_table, how=self.how, ) lhs, rhs = self._restore_categorical_keys(lhs, rhs) @@ -307,7 +302,7 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: del right_names[name] # Assemble the data columns of the result: - data = cudf.core.column_accessor.ColumnAccessor() + data = left_result._data.__class__() for lcol in left_names: data[left_names[lcol]] = left_result._data[lcol] @@ -316,10 +311,7 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: # Index of the result: if self.left_index and self.right_index: - if self.how == "right": - index = right_result._index - else: - index = left_result._index + index = left_result._index elif self.left_index: # left_index and right_on index = right_result._index From 31848962927b335d4db855baba0528e77f82906b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Feb 2021 16:07:30 
-0500 Subject: [PATCH 061/138] Example doc --- cpp/include/cudf/join.hpp | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 2707c60fa34..fcbaf0da795 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -32,10 +32,33 @@ namespace cudf { */ /** - * @brief Performs an inner join on the specified columns of two - * tables (`left`, `right`), and returns the row indices corresponding - * to the result. - */ // TODO: explain this better + * @brief Returns the row indices to use when constructing + * the result of performing an inner join between two tables. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Result: {{1, 2}, {0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Result: {{1}, {0}} + * @endcode + * @throw cudf::logic_error if number of elements in `left_on` or `right_on` + * mismatch. + * @throw cudf::logic_error if number of columns in either `left` or `right` + * table is 0 or exceeds MAX_JOIN_SIZE + * + * @param[in] left_keys A table representing the keys of the left table of the join + * @param[in] right_keys A table representing the keys of the right table of the join + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * the result of performing an inner join between two tables with `left_keys` and `right_keys` + * as the join keys. 
+ */ std::pair, std::unique_ptr> inner_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, From d3535dcfaa897a674192946185a50bf4072c3d12 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 25 Feb 2021 16:08:36 -0500 Subject: [PATCH 062/138] Refactor join APIs to return a device_uvector --- cpp/include/cudf/join.hpp | 16 ++--- cpp/src/join/hash_join.cu | 45 ++++-------- cpp/src/join/hash_join.cuh | 8 +-- cpp/src/join/join.cu | 138 ++++++++++++++++--------------------- cpp/src/join/semi_join.cu | 44 +++++------- 5 files changed, 103 insertions(+), 148 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index fcbaf0da795..b3988beaef6 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -59,7 +59,7 @@ namespace cudf { * the result of performing an inner join between two tables with `left_keys` and `right_keys` * as the join keys . */ -std::pair, std::unique_ptr> inner_join( +std::pair, rmm::device_uvector> inner_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -135,7 +135,7 @@ std::unique_ptr inner_join( * tables (`left`, `right`), and returns the row indices corresponding * to the result. */ // TODO: explain this better -std::pair, std::unique_ptr> left_join( +std::pair, rmm::device_uvector> left_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -213,7 +213,7 @@ std::unique_ptr left_join( * tables (`left`, `right`), and returns the row indices corresponding * to the result. 
*/ // TODO: explain this better -std::pair, std::unique_ptr> full_join( +std::pair, rmm::device_uvector> full_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -342,7 +342,7 @@ std::unique_ptr left_semi_join( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** TODO: document */ -std::unique_ptr left_semi_join( +rmm::device_uvector left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -404,7 +404,7 @@ std::unique_ptr left_anti_join( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** TODO: document */ -std::unique_ptr left_anti_join( +rmm::device_uvector left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -473,7 +473,7 @@ class hash_join { * tables (`left`, `right`), and returns the row indices corresponding * to the result. */ // TODO: explain this better - std::pair, std::unique_ptr> inner_join( + std::pair, rmm::device_uvector> inner_join( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -484,7 +484,7 @@ class hash_join { * tables (`left`, `right`), and returns the row indices corresponding * to the result. */ // TODO: explain this better - std::pair, std::unique_ptr> left_join( + std::pair, rmm::device_uvector> left_join( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -495,7 +495,7 @@ class hash_join { * tables (`left`, `right`), and returns the row indices corresponding * to the result. 
*/ // TODO: explain this better - std::pair, std::unique_ptr> full_join( + std::pair, rmm::device_uvector> full_join( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index d8fe8870001..cb0e5bc4901 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include @@ -105,7 +106,12 @@ get_left_join_indices_complement(rmm::device_uvector &right_indices, 0); } else { // Assume all the indices in invalid_index_map are invalid - rmm::device_vector invalid_index_map(right_table_row_count, 1); + rmm::device_uvector invalid_index_map(right_table_row_count, stream); + thrust::uninitialized_fill(thrust::cuda::par.on(stream.value()), + invalid_index_map.begin(), + invalid_index_map.end(), + int32_t{1}); + // Functor to check for index validity since left joins can create invalid indices valid_range valid(0, right_table_row_count); @@ -295,7 +301,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, _hash_table = build_join_hash_table(_build, compare_nulls, stream); } -std::pair, std::unique_ptr> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -305,7 +311,7 @@ hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, return compute_hash_join(probe, compare_nulls, stream, mr); } -std::pair, std::unique_ptr> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::left_join(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -315,7 +321,7 @@ hash_join::hash_join_impl::left_join(cudf::table_view const &probe, return compute_hash_join(probe, compare_nulls, stream, mr); } 
-std::pair, std::unique_ptr> +std::pair, rmm::device_uvector> hash_join::hash_join_impl::full_join(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -327,10 +333,10 @@ hash_join::hash_join_impl::full_join(cudf::table_view const &probe, template std::pair, rmm::device_uvector> -hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -353,29 +359,6 @@ hash_join::hash_join_impl::compute_hash_join_indices(cudf::table_view const &pro return probe_join_indices(probe, compare_nulls, stream); } -template -std::pair, std::unique_ptr> -hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const -{ - auto join_indices = compute_hash_join_indices(probe, compare_nulls, stream, mr); - auto join_size = join_indices.first.size(); - auto left_map = std::make_unique(cudf::data_type(type_to_id()), - join_size, - join_indices.first.release(), - rmm::device_buffer{}, - 0); - auto right_map = std::make_unique(cudf::data_type(type_to_id()), - join_size, - join_indices.second.release(), - rmm::device_buffer{}, - 0); - return std::make_pair, std::unique_ptr>( - std::move(left_map), std::move(right_map)); -} - template std::pair, rmm::device_uvector> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index d547d5190c4..cebf8fd612e 100644 --- a/cpp/src/join/hash_join.cuh +++ 
b/cpp/src/join/hash_join.cuh @@ -230,19 +230,19 @@ struct hash_join::hash_join_impl { null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); - std::pair, std::unique_ptr> inner_join( + std::pair, rmm::device_uvector> inner_join( cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::pair, std::unique_ptr> left_join( + std::pair, rmm::device_uvector> left_join( cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::pair, std::unique_ptr> full_join( + std::pair, rmm::device_uvector> full_join( cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -257,7 +257,7 @@ struct hash_join::hash_join_impl { rmm::mr::device_memory_resource* mr) const; template - std::pair, std::unique_ptr> compute_hash_join( + std::pair, rmm::device_uvector> compute_hash_join( cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 15b1f216928..a9f4b507efc 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -26,7 +26,7 @@ namespace cudf { namespace detail { -std::pair, std::unique_ptr> inner_join( +std::pair, rmm::device_uvector> inner_join( table_view const& left_input, table_view const& right_input, null_equality compare_nulls, @@ -50,7 +50,7 @@ std::pair, std::unique_ptr> inner_jo if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); auto result = hj_obj.inner_join(right, compare_nulls, stream, mr); - return std::make_pair, std::unique_ptr>( + return std::make_pair, rmm::device_uvector>( std::move(result.second), std::move(result.first)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); @@ -82,48 +82,40 @@ std::unique_ptr
inner_join(table_view const& left_input, // build the hash map from the smaller table. if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left.select(left_on), compare_nulls, stream); - auto join_indices = hj_obj.inner_join(right.select(right_on), compare_nulls, stream, mr); - auto join_indices_view = std::make_pair( - join_indices.first->view(), join_indices.second->view()); - std::unique_ptr
left_result = - detail::gather(left, - join_indices_view.second.template begin(), - join_indices_view.second.template end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - std::unique_ptr
right_result = - detail::gather(right, - join_indices_view.first.template begin(), - join_indices_view.first.template end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); + auto join_indices = hj_obj.inner_join(right.select(right_on), compare_nulls, stream, mr); + std::unique_ptr
left_result = detail::gather(left, + join_indices.second.begin(), + join_indices.second.end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.first.begin(), + join_indices.first.end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } else { cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); - auto join_indices = hj_obj.inner_join(left.select(left_on), compare_nulls, stream, mr); - auto join_indices_view = std::make_pair( - join_indices.first->view(), join_indices.second->view()); - std::unique_ptr
left_result = - detail::gather(left, - join_indices_view.first.template begin(), - join_indices_view.first.template end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - std::unique_ptr
right_result = - detail::gather(right, - join_indices_view.second.template begin(), - join_indices_view.second.template end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); + auto join_indices = hj_obj.inner_join(left.select(left_on), compare_nulls, stream, mr); + std::unique_ptr
left_result = detail::gather(left, + join_indices.first.begin(), + join_indices.first.end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.second.begin(), + join_indices.second.end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } } -std::pair, std::unique_ptr> left_join( +std::pair, rmm::device_uvector> left_join( table_view const& left_input, table_view const& right_input, null_equality compare_nulls, @@ -171,27 +163,22 @@ std::unique_ptr
left_join(table_view const& left_input, return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } - - auto join_indices_view = std::make_pair( - join_indices.first->view(), join_indices.second->view()); - std::unique_ptr
left_result = - detail::gather(left, - join_indices_view.first.template begin(), - join_indices_view.first.template end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); - std::unique_ptr
right_result = - detail::gather(right, - join_indices_view.second.template begin(), - join_indices_view.second.template end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); + std::unique_ptr
left_result = detail::gather(left, + join_indices.first.begin(), + join_indices.first.end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.second.begin(), + join_indices.second.end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } -std::pair, std::unique_ptr> full_join( +std::pair, rmm::device_uvector> full_join( table_view const& left_input, table_view const& right_input, null_equality compare_nulls, @@ -239,23 +226,18 @@ std::unique_ptr
full_join(table_view const& left_input, return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } - - auto join_indices_view = std::make_pair( - join_indices.first->view(), join_indices.second->view()); - std::unique_ptr
left_result = - detail::gather(left, - join_indices_view.first.template begin(), - join_indices_view.first.template end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); - std::unique_ptr
right_result = - detail::gather(right, - join_indices_view.second.template begin(), - join_indices_view.second.template end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); + std::unique_ptr
left_result = detail::gather(left, + join_indices.first.begin(), + join_indices.first.end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.second.begin(), + join_indices.second.end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } @@ -270,7 +252,7 @@ hash_join::hash_join(cudf::table_view const& build, { } -std::pair, std::unique_ptr> hash_join::inner_join( +std::pair, rmm::device_uvector> hash_join::inner_join( cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -279,7 +261,7 @@ std::pair, std::unique_ptr> hash_joi return impl->inner_join(probe, compare_nulls, stream, mr); } -std::pair, std::unique_ptr> hash_join::left_join( +std::pair, rmm::device_uvector> hash_join::left_join( cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -288,7 +270,7 @@ std::pair, std::unique_ptr> hash_joi return impl->left_join(probe, compare_nulls, stream, mr); } -std::pair, std::unique_ptr> hash_join::full_join( +std::pair, rmm::device_uvector> hash_join::full_join( cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -299,7 +281,7 @@ std::pair, std::unique_ptr> hash_joi // external APIs -std::pair, std::unique_ptr> inner_join( +std::pair, rmm::device_uvector> inner_join( table_view const& left, table_view const& right, null_equality compare_nulls, @@ -321,7 +303,7 @@ std::unique_ptr
inner_join(table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -std::pair, std::unique_ptr> left_join( +std::pair, rmm::device_uvector> left_join( table_view const& left, table_view const& right, null_equality compare_nulls, @@ -343,7 +325,7 @@ std::unique_ptr
left_join(table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -std::pair, std::unique_ptr> full_join( +std::pair, rmm::device_uvector> full_join( table_view const& left, table_view const& right, null_equality compare_nulls, diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index e5966dd01e4..f1b797aa9f5 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -38,7 +38,7 @@ namespace cudf { namespace detail { template -std::unique_ptr left_semi_anti_join( +rmm::device_uvector left_semi_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls, @@ -49,18 +49,12 @@ std::unique_ptr left_semi_anti_join( CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); if (is_trivial_join(left_keys, right_keys, JoinKind)) { - return std::make_unique(cudf::data_type(type_to_id()), - 0, - rmm::device_buffer{}, - rmm::device_buffer{}, - 0); + return rmm::device_uvector(0, stream); } if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); - static_cast(zero.get())->set_value(0, stream); - return cudf::detail::sequence(left_keys.num_rows(), *zero, stream); + auto result = rmm::device_uvector(left_keys.num_rows(), stream); + thrust::sequence(thrust::cuda::par.on(stream.value()), result.begin(), result.end()); + return std::move(result); } auto const left_num_rows = left_keys.num_rows(); @@ -103,7 +97,7 @@ std::unique_ptr left_semi_anti_join( // For semi join we want contains to be true, for anti join we want contains to be false bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); - rmm::device_uvector gather_map(left_num_rows, stream); + rmm::device_uvector gather_map(left_num_rows, stream); // gather_map_end will be the end of valid 
data in gather_map auto gather_map_end = thrust::copy_if( @@ -117,11 +111,7 @@ std::unique_ptr left_semi_anti_join( }); auto join_size = thrust::distance(gather_map.begin(), gather_map_end); - return std::make_unique(cudf::data_type(type_to_id()), - join_size, - gather_map.release(), - rmm::device_buffer{}, - 0); + return std::move(gather_map); } /** @@ -195,8 +185,8 @@ std::unique_ptr left_semi_anti_join( auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather(left_updated, - gather_map->view().template begin(), - gather_map->view().template end(), + gather_map.begin(), + gather_map.end(), out_of_bounds_policy::DONT_CHECK, stream, mr); @@ -216,10 +206,10 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr left_semi_join(cudf::table_view const& left, - cudf::table_view const& right, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +rmm::device_uvector left_semi_join(cudf::table_view const& left, + cudf::table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( @@ -238,10 +228,10 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr left_anti_join(cudf::table_view const& left, - cudf::table_view const& right, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +rmm::device_uvector left_anti_join(cudf::table_view const& left, + cudf::table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( From b82181d4a35a7d46b27939ae525feb45101dec24 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 3 Mar 2021 11:22:00 -0500 Subject: [PATCH 063/138] docs --- cpp/include/cudf/join.hpp | 269 
++++++++++++++++++++++---------------- 1 file changed, 158 insertions(+), 111 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b3988beaef6..b57a9ca079e 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -32,7 +32,7 @@ namespace cudf { */ /** - * @brief Returns the row indices to use when constructing + * @brief Returns the row indices that can be used to construct * the result of performing an inner join between two tables. * * @code{.pseudo} @@ -44,10 +44,8 @@ namespace cudf { * Right: {{1, 2, 3}, {4, 6, 7}} * Result: {{1}, {0}} * - * @throw cudf::logic_error if number of elements in `left_on` or `right_on` + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` * mismatch. - * @throw cudf::logic_error if number of columns in either `left` or `right` - * table is 0 or exceeds MAX_JOIN_SIZE * * @param[in] left A table representing the keys of the left table of the join * @param[in] right A table representing the keys of the right table of the join @@ -73,26 +71,13 @@ std::pair, rmm::device_uvector> inner_ * in the columns being joined on match. * * @code{.pseudo} - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, a: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * columns_in_common: { {0, 1} } - * Result: { a: {1, 2}, b: {1, 2} } - * - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} - * left_on: {0} - * right_on: {0} - * columns_in_common: { } - * Result: { a: {1, 2}, b: {1, 2}, c: {1, 2} } + * Result: {{1, 2}, {1, 2}, {1, 2}} * @endcode * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`. 
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) such that the location of `L` within `left_on` is not equal to - * location of R within `right_on` * @throw cudf::logic_error if number of elements in `left_on` or `right_on` * mismatch. * @throw cudf::logic_error if number of columns in either `left` or `right` @@ -108,19 +93,12 @@ std::pair, rmm::device_uvector> inner_ * @param[in] right_on The column indices from `right` to join on. * The column from `right` indicated by `right_on[i]` will be compared against the column * from `left` indicated by `left_on[i]`. - * @param[in] columns_in_common is a vector of pairs of column indices into - * `left` and `right`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `left_on` columns. Else, for every column in `left_on` and `right_on`, - * an output column will be produced. For each of these pairs (L, R), L - * should exist in `left_on` and R should exist in `right_on`. * @param[in] compare_nulls controls whether null join-key values * should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table will be joined columns of - * `left(including common columns)+right(excluding common columns)`. + * specified by `left_on` and `right_on`. */ std::unique_ptr inner_join( cudf::table_view const& left, @@ -130,11 +108,35 @@ std::unique_ptr inner_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** - * @brief Performs a left join on the specified columns of two - * tables (`left`, `right`), and returns the row indices corresponding - * to the result. 
- */ // TODO: explain this better + * @brief Returns the row indices that can be used to construct + * the result of performing a left join between two tables. + * For rows in the left table that do not have a match in the + * right table, the right row index is an unspecified out-of-bounds value. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Result: {{0, 1, 2}, {None, 0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Result: {{0, 1, 2}, {None, 0, None}} + * @endcode + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param[in] left_keys A table representing the keys of the left table of the join + * @param[in] right_keys A table representing the keys of the right table of the join + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a left join between two tables with `left_keys` and `right_keys` + * as the join keys. + */ std::pair, rmm::device_uvector> left_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, @@ -145,32 +147,25 @@ std::pair, rmm::device_uvector> left_j * @brief Performs a left join (also known as left outer join) on the * specified columns of two tables (`left`, `right`) * - * Left Join returns all the rows from the left table and those rows from the + * Left join returns all the rows from the left table and those rows from the * right table that match on the joined columns. * For rows from the right table that do not have a match, the corresponding * values in the left columns will be null.
* * @code{.pseudo} - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, a: {1 ,2 ,5} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}, {1, 2 ,5}} * left_on: {0} * right_on: {1} - * columns_in_common: { {0, 1} } - * Result: { a: {0, 1, 2}, b: {NULL, 1, 2} } + * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} } * - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {0} - * columns_in_common: { } - * Result: { a: {0, 1, 2}, b: {NULL, 1, 2}, c: {NULL, 1, 2} } + * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} } * @endcode * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`. - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) such that the location of `L` within `left_on` is not equal to - * location of R within `right_on` * @throw cudf::logic_error if number of elements in `left_on` or `right_on` * mismatch. * @throw cudf::logic_error if number of columns in either `left` or `right` @@ -186,19 +181,12 @@ std::pair, rmm::device_uvector> left_j * @param[in] right_on The column indices from `right` to join on. * The column from `right` indicated by `right_on[i]` will be compared against the column * from `left` indicated by `left_on[i]`. - * @param[in] columns_in_common is a vector of pairs of column indices into - * `left` and `right`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `left_on` columns. Else, for every column in `left_on` and `right_on`, - * an output column will be produced. For each of these pairs (L, R), L - * should exist in `left_on` and R should exist in `right_on`. * @param[in] compare_nulls controls whether null join-key values * should match or not. 
* @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table will be joined columns of - * `left(including common columns)+right(excluding common columns)`. + * specified by `left_on` and `right_on`. */ std::unique_ptr left_join( cudf::table_view const& left, @@ -208,11 +196,33 @@ std::unique_ptr left_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** - * @brief Performs a left join on the specified columns of two - * tables (`left`, `right`), and returns the row indices corresponding - * to the result. - */ // TODO: explain this better + * @brief Returns the row indices that can be used to construct + * the result of performing a full join between two tables. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Result: {{0, 1, 2, None}, {None, 0, 1, 2}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} + * @endcode + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param[in] left_keys A table representing the keys of the left table of the join + * @param[in] right_keys A table representing the keys of the right table of the join + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a full join between two tables with `left_keys` and `right_keys` + * as the join keys.
+ */ std::pair, rmm::device_uvector> full_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, @@ -229,26 +239,19 @@ std::pair, rmm::device_uvector> full_j * values in the left columns will be null. * * @code{.pseudo} - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * columns_in_common: { {0, 1} } - * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} } + * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} } * - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {0} - * columns_in_common: { } - * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} } + * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} } * @endcode * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`. - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) such that the location of `L` within `left_on` is not equal to - * location of R within `right_on` * @throw cudf::logic_error if number of elements in `left_on` or `right_on` * mismatch. * @throw cudf::logic_error if number of columns in either `left` or `right` @@ -264,19 +267,12 @@ std::pair, rmm::device_uvector> full_j * @param[in] right_on The column indices from `right` to join on. * The column from `right` indicated by `right_on[i]` will be compared against the column * from `left` indicated by `left_on[i]`. - * @param[in] columns_in_common is a vector of pairs of column indices into - * `left` and `right`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `left_on` columns. 
Else, for every column in `left_on` and `right_on`, - * an output column will be produced. For each of these pairs (L, R), L - * should exist in `left_on` and R should exist in `right_on`. * @param[in] compare_nulls controls whether null join-key values * should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table will be joined columns of - * `left(including common columns)+right(excluding common columns)`. + * specified by `left_on` and `right_on`. */ std::unique_ptr full_join( cudf::table_view const& left, @@ -294,24 +290,20 @@ std::unique_ptr full_join( * returns rows that exist in the right table. * * @code{.pseudo} - * TableA a: {0, 1, 2} - * TableB b: {1, 2, 3}, a: {1, 2, 5} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * return_columns: { 0 } - * Result: { a: {1, 2} } + * Result: { {1, 2} } * - * TableA a: {0, 1, 2}, c: {1, 2, 5} - * TableB b: {1, 2, 3} + * TableA {{0, 1, 2}, {1, 2, 5}} + * TableB {{1, 2, 3}} * left_on: {0} * right_on: {0} - * return_columns: { 1 } - * Result: { c: {1, 2} } + * Result: { {1, 2}, {2, 5} } * @endcode * - * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0 - * @throw cudf::logic_error if the number of returned columns is 0 - * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal + * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0 * * @param[in] left The left table * @param[in] right The right table @@ -323,15 +315,12 @@ std::unique_ptr full_join( * The column from `right` indicated by `right_on[i]` * will be compared against the column from `left` * indicated by `left_on[i]`. 
- * @param[in] return_columns A vector of column indices from `left` to - * include in the returned table. * @param[in] compare_nulls Controls whether null join-key values should match or not. * @param[in] mr Device memory resource used to allocate the returned table's * device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table - * will contain `return_columns` from `left` that match in right. + * specified by `left_on` and `right_on`. */ std::unique_ptr left_semi_join( cudf::table_view const& left, @@ -341,7 +330,40 @@ std::unique_ptr left_semi_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** TODO: document */ + +/** + * @brief Returns the row indices that can be used to construct + * the result of performing a left semi join between two tables. + * + * @code{.pseudo} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}, {1, 2, 5}} + * left_on: {0} + * right_on: {1} + * Result: {1, 2} + * + * TableA {{0, 1, 2}, {1, 2, 5}} + * TableB {{1, 2, 3}} + * left_on: {0} + * right_on: {0} + * Result: {1, 2} + * @endcode + * + * @throw cudf::logic_error if number of elements in `left_on` or `right_on` + * mismatch. + * @throw cudf::logic_error if number of columns in either `left` or `right` + * table is 0 or exceeds MAX_JOIN_SIZE + * + * @param[in] left A table representing the keys of the left table of the join + * @param[in] right A table representing the keys of the right table of the join + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A column `left_indices` that can be used to construct + * the result of performing a left semi join between two tables with + * `left_keys` and `right_keys` as the join keys . 
+ */ rmm::device_uvector left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, @@ -356,24 +378,23 @@ rmm::device_uvector left_semi_join( * returns rows that do not exist in the right table. * * @code{.pseudo} - * TableA a: {0, 1, 2} - * TableB b: {1, 2, 3}, a: {1, 2, 5} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * return_columns: { 0 } - * Result: { a: {0} } + * Result: {{0}, {1}} * - * TableA a: {0, 1, 2}, c: {1, 2, 5} - * TableB b: {1, 2, 3} + * TableA: {{0, 1, 2}, {1, 2, 5}} + * TableB: {{1, 2, 3}} * left_on: {0} * right_on: {0} - * return_columns: { 1 } - * Result: { c: {1} } + * Result: { {0} {1} } * @endcode * - * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0 - * @throw cudf::logic_error if the number of returned columns is 0 - * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal + * @throw cudf::logic_error if number of elements in `left_on` or `right_on` + * mismatch. + * @throw cudf::logic_error if number of columns in either `left` or `right` + * table is 0 or exceeds MAX_JOIN_SIZE * * @param[in] left The left table * @param[in] right The right table @@ -385,15 +406,12 @@ rmm::device_uvector left_semi_join( * The column from `right` indicated by `right_on[i]` * will be compared against the column from `left` * indicated by `left_on[i]`. - * @param[in] return_columns A vector of column indices from `left` to - * include in the returned table. * @param[in] compare_nulls Controls whether null join-key values should match or not. * @param[in] mr Device memory resource used to allocate the returned table's * device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table - * will contain `return_columns` from `left` that match in right. + * specified by `left_on` and `right_on`. 
*/ std::unique_ptr left_anti_join( cudf::table_view const& left, @@ -403,7 +421,36 @@ std::unique_ptr left_anti_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** TODO: document */ +/** + * @brief Returns the row indices that can be used to construct + * the result of performing a left anti join between two tables. + * + * @code{.pseudo} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}, {1, 2, 5}} + * left_on: {0} + * right_on: {1} + * Result: {0} + * + * TableA: {{0, 1, 2}, {1, 2, 5}} + * TableB: {{1, 2, 3}} + * left_on: {0} + * right_on: {0} + * Result: {0} + * @endcode + * + * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0 + * + * @param[in] left A table representing the keys of the left table of the join + * @param[in] right A table representing the keys of the right table of the join + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A column `left_indices` that can be used to construct + * the result of performing a left anti join between two tables with + * `left_keys` and `right_keys` as the join keys . + */ rmm::device_uvector left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, From 77d2bfdf1d5ecc62db76b42c883ebe42f566b3e4 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 3 Mar 2021 11:29:47 -0500 Subject: [PATCH 064/138] Finish up docs? --- cpp/include/cudf/join.hpp | 53 +++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b57a9ca079e..2f6981fb81c 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -507,7 +507,6 @@ class hash_join { * undefined. 
* * @param build The build table, from which the hash table is built. - * @param build_on The column indices from `build` to join on. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches */ @@ -516,10 +515,19 @@ class hash_join { rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Performs an inner join on the specified columns of two - * tables (`left`, `right`), and returns the row indices corresponding - * to the result. - */ // TODO: explain this better + * Returns the row indices that can be used to construct the result of performing + * an inner join between two tables. @see cudf::inner_join(). + * + * @param probe The probe table, from which the tuples are probed. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device + * memory. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * the result of performing an inner join between two tables with `build` and `probe` + * as the the join keys . + */ std::pair, rmm::device_uvector> inner_join( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, @@ -527,21 +535,40 @@ class hash_join { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** - * @brief Performs a left join on the specified columns of two - * tables (`left`, `right`), and returns the row indices corresponding - * to the result. - */ // TODO: explain this better + * Returns the row indices that can be used to construct the result of performing + * a left join between two tables. @see cudf::left_join(). + * + * @param probe The probe table, from which the tuples are probed. 
+ * @param compare_nulls Controls whether null join-key values should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device + * memory. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a left join between two tables with `build` and `probe` + * as the the join keys . + */ std::pair, rmm::device_uvector> left_join( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + /** - * @brief Performs a full join on the specified columns of two - * tables (`left`, `right`), and returns the row indices corresponding - * to the result. - */ // TODO: explain this better + * Returns the row indices that can be used to construct the result of performing + * a full join between two tables. @see cudf::full_join(). + * + * @param probe The probe table, from which the tuples are probed. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device + * memory. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a full join between two tables with `build` and `probe` + * as the the join keys . 
+ */ std::pair, rmm::device_uvector> full_join( cudf::table_view const& probe, null_equality compare_nulls = null_equality::EQUAL, From 26a3fb0b32d27e0051955d13af42b4dd548c773c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 4 Mar 2021 15:01:58 -0500 Subject: [PATCH 065/138] Fix join tests --- cpp/tests/join/join_tests.cpp | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index fbde179d33d..1b910bfd0cc 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1072,8 +1072,15 @@ TEST_F(JoinTest, HashJoinSequentialProbes) Table t0(std::move(cols0)); - auto result = hash_join.full_join(t0); - auto result_table = cudf::table_view({result.first->view(), result.second->view()}); + auto result = hash_join.full_join(t0); + + auto result_table = + cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.first.size()), + result.first.data()}, + cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.second.size()), + result.second.data()}}); auto result_sort_order = cudf::sorted_order(result_table); auto sorted_result = cudf::gather(result_table, *result_sort_order); @@ -1098,8 +1105,14 @@ TEST_F(JoinTest, HashJoinSequentialProbes) Table t0(std::move(cols0)); - auto result = hash_join.left_join(t0); - auto result_table = cudf::table_view({result.first->view(), result.second->view()}); + auto result = hash_join.left_join(t0); + auto result_table = + cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.first.size()), + result.first.data()}, + cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.second.size()), + result.second.data()}}); auto result_sort_order = cudf::sorted_order(result_table); auto sorted_result = cudf::gather(result_table, *result_sort_order); @@ -1124,8 +1137,14 @@ TEST_F(JoinTest, 
HashJoinSequentialProbes) Table t0(std::move(cols0)); - auto result = hash_join.inner_join(t0); - auto result_table = cudf::table_view({result.first->view(), result.second->view()}); + auto result = hash_join.inner_join(t0); + auto result_table = + cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.first.size()), + result.first.data()}, + cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.second.size()), + result.second.data()}}); auto result_sort_order = cudf::sorted_order(result_table); auto sorted_result = cudf::gather(result_table, *result_sort_order); From 8a60d622b2bb9353fb79defb1650bdc294355f33 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 5 Mar 2021 10:50:22 -0500 Subject: [PATCH 066/138] Refactor join APIs to work with unique_ptr> --- cpp/include/cudf/join.hpp | 74 +++++++-------- cpp/src/join/hash_join.cu | 135 ++++++++++++++------------- cpp/src/join/hash_join.cuh | 74 +++++++-------- cpp/src/join/join.cu | 142 +++++++++++++++-------------- cpp/src/join/join_common_utils.hpp | 3 +- cpp/src/join/semi_join.cu | 37 ++++---- cpp/tests/join/join_tests.cpp | 24 ++--- 7 files changed, 260 insertions(+), 229 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 2f6981fb81c..b1f06697522 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -57,11 +57,12 @@ namespace cudf { * the result of performing an inner join between two tables with `left_keys` and `right_keys` * as the join keys . 
*/ -std::pair, rmm::device_uvector> inner_join( - cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair>, + std::unique_ptr>> +inner_join(cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs an inner join on the specified columns of two @@ -108,7 +109,6 @@ std::unique_ptr inner_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns the row indices that can be used to construct * the result of performing a left join between two tables. @@ -137,11 +137,12 @@ std::unique_ptr inner_join( * the result of performing a left join between two tables with `left_keys` and `right_keys` * as the join keys . */ -std::pair, rmm::device_uvector> left_join( - cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair>, + std::unique_ptr>> +left_join(cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a left join (also known as left outer join) on the @@ -196,7 +197,6 @@ std::unique_ptr left_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns the row indices that can be used to construct * the result of performing a full join between two tables. 
@@ -223,11 +223,12 @@ std::unique_ptr left_join( * the result of performing a full join between two tables with `left_keys` and `right_keys` * as the join keys . */ -std::pair, rmm::device_uvector> full_join( - cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair>, + std::unique_ptr>> +full_join(cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a full join (also known as full outer join) on the @@ -330,7 +331,6 @@ std::unique_ptr left_semi_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns the row indices that can be used to construct * the result of performing a left semi join between two tables. @@ -364,7 +364,7 @@ std::unique_ptr left_semi_join( * the result of performing a left semi join between two tables with * `left_keys` and `right_keys` as the join keys . */ -rmm::device_uvector left_semi_join( +std::unique_ptr> left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -451,7 +451,7 @@ std::unique_ptr left_anti_join( * the result of performing a left anti join between two tables with * `left_keys` and `right_keys` as the join keys . */ -rmm::device_uvector left_anti_join( +std::unique_ptr> left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -528,11 +528,12 @@ class hash_join { * the result of performing an inner join between two tables with `build` and `probe` * as the the join keys . 
*/ - std::pair, rmm::device_uvector> inner_join( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + std::pair>, + std::unique_ptr>> + inner_join(cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** * Returns the row indices that can be used to construct the result of performing @@ -548,12 +549,12 @@ class hash_join { * the result of performing a left join between two tables with `build` and `probe` * as the the join keys . */ - std::pair, rmm::device_uvector> left_join( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; - + std::pair>, + std::unique_ptr>> + left_join(cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** * Returns the row indices that can be used to construct the result of performing @@ -569,11 +570,12 @@ class hash_join { * the result of performing a full join between two tables with `build` and `probe` * as the the join keys . 
*/ - std::pair, rmm::device_uvector> full_join( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + std::pair>, + std::unique_ptr>> + full_join(cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: struct hash_join_impl; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index cb0e5bc4901..301726a978f 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -40,22 +40,24 @@ std::pair, std::unique_ptr
> get_empty_joined_table VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b, rmm::cuda_stream_view stream) { - CUDF_EXPECTS((a.first.size() == a.second.size()), + CUDF_EXPECTS((a.first->size() == a.second->size()), "Mismatch between sizes of vectors in vector pair"); - CUDF_EXPECTS((b.first.size() == b.second.size()), + CUDF_EXPECTS((b.first->size() == b.second->size()), "Mismatch between sizes of vectors in vector pair"); - if (a.first.is_empty()) { + if (a.first->is_empty()) { return std::move(b); - } else if (b.first.is_empty()) { + } else if (b.first->is_empty()) { return std::move(a); } - auto original_size = a.first.size(); - a.first.resize(a.first.size() + b.first.size(), stream); - a.second.resize(a.second.size() + b.second.size(), stream); + auto original_size = a.first->size(); + a.first->resize(a.first->size() + b.first->size(), stream); + a.second->resize(a.second->size() + b.second->size(), stream); thrust::copy( - rmm::exec_policy(stream), b.first.begin(), b.first.end(), a.first.begin() + original_size); - thrust::copy( - rmm::exec_policy(stream), b.second.begin(), b.second.end(), a.second.begin() + original_size); + rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); + thrust::copy(rmm::exec_policy(stream), + b.second->begin(), + b.second->end(), + a.second->begin() + original_size); return std::move(a); } @@ -83,8 +85,9 @@ struct valid_range { * * @return Pair of vectors containing the left join indices complement */ -std::pair, rmm::device_uvector> -get_left_join_indices_complement(rmm::device_uvector &right_indices, +std::pair>, + std::unique_ptr>> +get_left_join_indices_complement(std::unique_ptr> &right_indices, size_type left_table_row_count, size_type right_table_row_count, rmm::cuda_stream_view stream) @@ -92,7 +95,8 @@ get_left_join_indices_complement(rmm::device_uvector &right_indices, // Get array of indices that do not appear in right_indices // Vector allocated for unmatched result 
- rmm::device_uvector right_indices_complement(right_table_row_count, stream); + auto right_indices_complement = + std::make_unique>(right_table_row_count, stream); // If left table is empty in a full join call then all rows of the right table // should be represented in the joined indices. This is an optimization since @@ -101,15 +105,16 @@ get_left_join_indices_complement(rmm::device_uvector &right_indices, // produce exactly the same result as the else path but will be faster. if (left_table_row_count == 0) { thrust::sequence(rmm::exec_policy(stream), - right_indices_complement.begin(), - right_indices_complement.end(), + right_indices_complement->begin(), + right_indices_complement->end(), 0); } else { // Assume all the indices in invalid_index_map are invalid - rmm::device_uvector invalid_index_map(right_table_row_count, stream); + auto invalid_index_map = + std::make_unique>(right_table_row_count, stream); thrust::uninitialized_fill(thrust::cuda::par.on(stream.value()), - invalid_index_map.begin(), - invalid_index_map.end(), + invalid_index_map->begin(), + invalid_index_map->end(), int32_t{1}); // Functor to check for index validity since left joins can create invalid indices @@ -119,11 +124,11 @@ get_left_join_indices_complement(rmm::device_uvector &right_indices, // Thus specifying that those locations are valid thrust::scatter_if(rmm::exec_policy(stream), thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0) + right_indices.size(), - right_indices.begin(), // Index locations - right_indices.begin(), // Stencil - Check if index location is valid - invalid_index_map.begin(), // Output indices - valid); // Stencil Predicate + thrust::make_constant_iterator(0) + right_indices->size(), + right_indices->begin(), // Index locations + right_indices->begin(), // Stencil - Check if index location is valid + invalid_index_map->begin(), // Output indices + valid); // Stencil Predicate size_type begin_counter = static_cast(0); size_type end_counter = 
static_cast(right_table_row_count); @@ -131,17 +136,18 @@ get_left_join_indices_complement(rmm::device_uvector &right_indices, size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(begin_counter), thrust::make_counting_iterator(end_counter), - invalid_index_map.begin(), - right_indices_complement.begin(), + invalid_index_map->begin(), + right_indices_complement->begin(), thrust::identity()) - - right_indices_complement.begin(); - right_indices_complement.resize(indices_count, stream); + right_indices_complement->begin(); + right_indices_complement->resize(indices_count, stream); } - rmm::device_uvector left_invalid_indices(right_indices_complement.size(), stream); + auto left_invalid_indices = + std::make_unique>(right_indices_complement->size(), stream); thrust::fill(rmm::exec_policy(stream), - left_invalid_indices.begin(), - left_invalid_indices.end(), + left_invalid_indices->begin(), + left_invalid_indices->end(), JoinNoneValue); return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); @@ -212,20 +218,21 @@ std::unique_ptr> build_join_ * @return Join output indices vector pair. 
*/ template -std::pair, rmm::device_uvector> probe_join_hash_table( - cudf::table_device_view build_table, - cudf::table_device_view probe_table, - multimap_type const &hash_table, - null_equality compare_nulls, - rmm::cuda_stream_view stream) +std::pair>, + std::unique_ptr>> +probe_join_hash_table(cudf::table_device_view build_table, + cudf::table_device_view probe_table, + multimap_type const &hash_table, + null_equality compare_nulls, + rmm::cuda_stream_view stream) { size_type estimated_size = estimate_join_output_size( build_table, probe_table, hash_table, compare_nulls, stream); // If the estimated output size is zero, return immediately if (estimated_size == 0) { - return std::make_pair(rmm::device_uvector{0, stream}, - rmm::device_uvector{0, stream}); + return std::make_pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); } // Because we are approximating the number of joined elements, our approximation @@ -235,12 +242,13 @@ std::pair, rmm::device_uvector> probe_ rmm::device_scalar write_index(0, stream); size_type join_size{0}; - rmm::device_uvector left_indices{0, stream}; - rmm::device_uvector right_indices{0, stream}; + auto left_indices = std::make_unique>(0, stream); + auto right_indices = std::make_unique>(0, stream); + auto current_estimated_size = estimated_size; do { - left_indices.resize(estimated_size, stream); - right_indices.resize(estimated_size, stream); + left_indices->resize(estimated_size, stream); + right_indices->resize(estimated_size, stream); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(probe_table.num_rows(), block_size); @@ -249,15 +257,16 @@ std::pair, rmm::device_uvector> probe_ row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; probe_hash_table - <<>>(hash_table, - build_table, - probe_table, - hash_probe, - equality, - left_indices.data(), - right_indices.data(), - write_index.data(), - estimated_size); + <<>>( + 
hash_table, + build_table, + probe_table, + hash_probe, + equality, + left_indices->data(), + right_indices->data(), + write_index.data(), + estimated_size); CHECK_CUDA(stream.value()); @@ -266,8 +275,8 @@ std::pair, rmm::device_uvector> probe_ estimated_size *= 2; } while ((current_estimated_size < join_size)); - left_indices.resize(join_size, stream); - right_indices.resize(join_size, stream); + left_indices->resize(join_size, stream); + right_indices->resize(join_size, stream); return std::make_pair(std::move(left_indices), std::move(right_indices)); } @@ -301,7 +310,8 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, _hash_table = build_join_hash_table(_build, compare_nulls, stream); } -std::pair, rmm::device_uvector> +std::pair>, + std::unique_ptr>> hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -311,7 +321,8 @@ hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, return compute_hash_join(probe, compare_nulls, stream, mr); } -std::pair, rmm::device_uvector> +std::pair>, + std::unique_ptr>> hash_join::hash_join_impl::left_join(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -321,7 +332,8 @@ hash_join::hash_join_impl::left_join(cudf::table_view const &probe, return compute_hash_join(probe, compare_nulls, stream, mr); } -std::pair, rmm::device_uvector> +std::pair>, + std::unique_ptr>> hash_join::hash_join_impl::full_join(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -332,7 +344,8 @@ hash_join::hash_join_impl::full_join(cudf::table_view const &probe, } template -std::pair, rmm::device_uvector> +std::pair>, + std::unique_ptr>> hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream, @@ -345,8 +358,8 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const 
&probe, "Mismatch in number of columns to be joined on"); if (is_trivial_join(probe, _build, JoinKind)) { - return std::make_pair(rmm::device_uvector{0, stream}, - rmm::device_uvector{0, stream}); + return std::make_pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); } CUDF_EXPECTS(std::equal(std::cbegin(_build), @@ -360,7 +373,8 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, } template -std::pair, rmm::device_uvector> +std::pair>, + std::unique_ptr>> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const @@ -386,7 +400,6 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, join_indices.second, probe.num_rows(), _build.num_rows(), stream); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } - return join_indices; } diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index cebf8fd612e..3bf20eb9433 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -184,13 +184,15 @@ size_type estimate_join_output_size(table_device_view build_table, * * @return Join output indices vector pair */ -inline std::pair, rmm::device_uvector> +inline std::pair>, + std::unique_ptr>> get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream) { - rmm::device_uvector left_indices(left.num_rows(), stream); - thrust::sequence(rmm::exec_policy(stream), left_indices.begin(), left_indices.end(), 0); - rmm::device_uvector right_indices(left.num_rows(), stream); - thrust::fill(rmm::exec_policy(stream), right_indices.begin(), right_indices.end(), JoinNoneValue); + auto left_indices = std::make_unique>(left.num_rows(), stream); + thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); + auto right_indices = std::make_unique>(left.num_rows(), stream); + thrust::fill( + rmm::exec_policy(stream), 
right_indices->begin(), right_indices->end(), JoinNoneValue); return std::make_pair(std::move(left_indices), std::move(right_indices)); } @@ -230,38 +232,35 @@ struct hash_join::hash_join_impl { null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); - std::pair, rmm::device_uvector> inner_join( - cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair, rmm::device_uvector> left_join( - cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair, rmm::device_uvector> full_join( - cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair>, + std::unique_ptr>> + inner_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + std::pair>, + std::unique_ptr>> + left_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + std::pair>, + std::unique_ptr>> + full_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; private: template - std::pair, rmm::device_uvector> - compute_hash_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - template - std::pair, rmm::device_uvector> compute_hash_join( - cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair>, + std::unique_ptr>> + compute_hash_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) const; /** * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, @@ -279,8 +278,11 @@ struct hash_join::hash_join_impl { * @return Join output indices vector pair. */ template - std::pair, rmm::device_uvector> probe_join_indices( - cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const; + std::pair>, + std::unique_ptr>> + probe_join_indices(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream) const; }; } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index a9f4b507efc..bce72862220 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -26,12 +26,13 @@ namespace cudf { namespace detail { -std::pair, rmm::device_uvector> inner_join( - table_view const& left_input, - table_view const& right_input, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +inner_join(table_view const& left_input, + table_view const& right_input, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -50,8 +51,9 @@ std::pair, rmm::device_uvector> inner_ if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); auto result = hj_obj.inner_join(right, compare_nulls, stream, mr); - return std::make_pair, rmm::device_uvector>( - std::move(result.second), std::move(result.first)); + return std::make_pair>, + std::unique_ptr>>(std::move(result.second), + std::move(result.first)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); return hj_obj.inner_join(left, compare_nulls, stream, mr); @@ -84,14 +86,14 @@ std::unique_ptr
inner_join(table_view const& left_input, cudf::hash_join hj_obj(left.select(left_on), compare_nulls, stream); auto join_indices = hj_obj.inner_join(right.select(right_on), compare_nulls, stream, mr); std::unique_ptr
left_result = detail::gather(left, - join_indices.second.begin(), - join_indices.second.end(), + join_indices.second->begin(), + join_indices.second->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); std::unique_ptr
right_result = detail::gather(right, - join_indices.first.begin(), - join_indices.first.end(), + join_indices.first->begin(), + join_indices.first->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); @@ -100,14 +102,14 @@ std::unique_ptr
inner_join(table_view const& left_input, cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); auto join_indices = hj_obj.inner_join(left.select(left_on), compare_nulls, stream, mr); std::unique_ptr
left_result = detail::gather(left, - join_indices.first.begin(), - join_indices.first.end(), + join_indices.first->begin(), + join_indices.first->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); std::unique_ptr
right_result = detail::gather(right, - join_indices.second.begin(), - join_indices.second.end(), + join_indices.second->begin(), + join_indices.second->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); @@ -115,12 +117,13 @@ std::unique_ptr
inner_join(table_view const& left_input, } } -std::pair, rmm::device_uvector> left_join( - table_view const& left_input, - table_view const& right_input, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +left_join(table_view const& left_input, + table_view const& right_input, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -164,26 +167,27 @@ std::unique_ptr
left_join(table_view const& left_input, std::move(probe_build_pair.second)); } std::unique_ptr
left_result = detail::gather(left, - join_indices.first.begin(), - join_indices.first.end(), + join_indices.first->begin(), + join_indices.first->end(), out_of_bounds_policy::NULLIFY, stream, mr); std::unique_ptr
right_result = detail::gather(right, - join_indices.second.begin(), - join_indices.second.end(), + join_indices.second->begin(), + join_indices.second->end(), out_of_bounds_policy::NULLIFY, stream, mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } -std::pair, rmm::device_uvector> full_join( - table_view const& left_input, - table_view const& right_input, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +full_join(table_view const& left_input, + table_view const& right_input, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -227,14 +231,14 @@ std::unique_ptr
full_join(table_view const& left_input, std::move(probe_build_pair.second)); } std::unique_ptr
left_result = detail::gather(left, - join_indices.first.begin(), - join_indices.first.end(), + join_indices.first->begin(), + join_indices.first->end(), out_of_bounds_policy::NULLIFY, stream, mr); std::unique_ptr
right_result = detail::gather(right, - join_indices.second.begin(), - join_indices.second.end(), + join_indices.second->begin(), + join_indices.second->end(), out_of_bounds_policy::NULLIFY, stream, mr); @@ -252,40 +256,44 @@ hash_join::hash_join(cudf::table_view const& build, { } -std::pair, rmm::device_uvector> hash_join::inner_join( - cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair>, + std::unique_ptr>> +hash_join::inner_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->inner_join(probe, compare_nulls, stream, mr); } -std::pair, rmm::device_uvector> hash_join::left_join( - cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair>, + std::unique_ptr>> +hash_join::left_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->left_join(probe, compare_nulls, stream, mr); } -std::pair, rmm::device_uvector> hash_join::full_join( - cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair>, + std::unique_ptr>> +hash_join::full_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->full_join(probe, compare_nulls, stream, mr); } // external APIs -std::pair, rmm::device_uvector> inner_join( - table_view const& left, - table_view const& right, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +inner_join(table_view const& left, + table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); return detail::inner_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); @@ -303,11 +311,12 @@ std::unique_ptr
inner_join(table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -std::pair, rmm::device_uvector> left_join( - table_view const& left, - table_view const& right, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +left_join(table_view const& left, + table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); @@ -325,11 +334,12 @@ std::unique_ptr
left_join(table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -std::pair, rmm::device_uvector> full_join( - table_view const& left, - table_view const& right, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +full_join(table_view const& left, + table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::full_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 1fcfffb96bb..ed33fab685c 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -33,7 +33,8 @@ constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; constexpr size_type JoinNoneValue = std::numeric_limits::min(); -using VectorPair = std::pair, rmm::device_uvector>; +using VectorPair = std::pair>, + std::unique_ptr>>; using multimap_type = concurrent_unordered_multimap -rmm::device_uvector left_semi_anti_join( +std::unique_ptr> left_semi_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls, @@ -49,11 +49,12 @@ rmm::device_uvector left_semi_anti_join( CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); if (is_trivial_join(left_keys, right_keys, JoinKind)) { - return rmm::device_uvector(0, stream); + return std::make_unique>(0, stream); } if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { - auto result = rmm::device_uvector(left_keys.num_rows(), stream); - thrust::sequence(thrust::cuda::par.on(stream.value()), result.begin(), result.end()); + auto result = + std::make_unique>(left_keys.num_rows(), stream); + thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end()); return std::move(result); } @@ -97,20 +98,20 @@ rmm::device_uvector 
left_semi_anti_join( // For semi join we want contains to be true, for anti join we want contains to be false bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); - rmm::device_uvector gather_map(left_num_rows, stream); + auto gather_map = std::make_unique>(left_num_rows, stream); // gather_map_end will be the end of valid data in gather_map auto gather_map_end = thrust::copy_if( rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(left_num_rows), - gather_map.begin(), + gather_map->begin(), [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) { auto pos = hash_table.find(idx, hash_probe, equality_probe); return (pos != hash_table.end()) == join_type_boolean; }); - auto join_size = thrust::distance(gather_map.begin(), gather_map_end); + auto join_size = thrust::distance(gather_map->begin(), gather_map_end); return std::move(gather_map); } @@ -185,8 +186,8 @@ std::unique_ptr left_semi_anti_join( auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather(left_updated, - gather_map.begin(), - gather_map.end(), + gather_map->begin(), + gather_map->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); @@ -206,10 +207,11 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -rmm::device_uvector left_semi_join(cudf::table_view const& left, - cudf::table_view const& right, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::unique_ptr> left_semi_join( + cudf::table_view const& left, + cudf::table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( @@ -228,10 +230,11 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -rmm::device_uvector 
left_anti_join(cudf::table_view const& left, - cudf::table_view const& right, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::unique_ptr> left_anti_join( + cudf::table_view const& left, + cudf::table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 1b910bfd0cc..32192234c56 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1076,11 +1076,11 @@ TEST_F(JoinTest, HashJoinSequentialProbes) auto result_table = cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.first.size()), - result.first.data()}, + static_cast(result.first->size()), + result.first->data()}, cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.second.size()), - result.second.data()}}); + static_cast(result.second->size()), + result.second->data()}}); auto result_sort_order = cudf::sorted_order(result_table); auto sorted_result = cudf::gather(result_table, *result_sort_order); @@ -1108,11 +1108,11 @@ TEST_F(JoinTest, HashJoinSequentialProbes) auto result = hash_join.left_join(t0); auto result_table = cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.first.size()), - result.first.data()}, + static_cast(result.first->size()), + result.first->data()}, cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.second.size()), - result.second.data()}}); + static_cast(result.second->size()), + result.second->data()}}); auto result_sort_order = cudf::sorted_order(result_table); auto sorted_result = cudf::gather(result_table, *result_sort_order); @@ -1140,11 +1140,11 @@ TEST_F(JoinTest, HashJoinSequentialProbes) auto result = hash_join.inner_join(t0); auto result_table = cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - 
static_cast(result.first.size()), - result.first.data()}, + static_cast(result.first->size()), + result.first->data()}, cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.second.size()), - result.second.data()}}); + static_cast(result.second->size()), + result.second->data()}}); auto result_sort_order = cudf::sorted_order(result_table); auto sorted_result = cudf::gather(result_table, *result_sort_order); From 387a9539778ce98c4f2a30464d15a354aef3eb6b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 5 Mar 2021 11:12:25 -0500 Subject: [PATCH 067/138] Update join Cython --- python/cudf/cudf/_lib/cpp/join.pxd | 14 +++++++----- python/cudf/cudf/_lib/join.pyx | 36 ++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd index 6ebde3934c3..c221fea926d 100644 --- a/python/cudf/cudf/_lib/cpp/join.pxd +++ b/python/cudf/cudf/_lib/cpp/join.pxd @@ -10,30 +10,34 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type +from rmm._lib.device_uvector cimport device_uvector +ctypedef unique_ptr[device_uvector[size_type]] gather_map_type + cdef extern from "cudf/join.hpp" namespace "cudf" nogil: - cdef pair[unique_ptr[column], unique_ptr[column]] inner_join( + cdef pair[gather_map_type, gather_map_type] inner_join( const table_view left_keys, const table_view right_keys, ) except + - cdef pair[unique_ptr[column], unique_ptr[column]] left_join( + cdef pair[gather_map_type, gather_map_type] left_join( const table_view left_keys, const table_view right_keys, ) except + - cdef pair[unique_ptr[column], unique_ptr[column]] full_join( + cdef pair[gather_map_type, gather_map_type] full_join( const table_view left_keys, const table_view right_keys, ) except + - cdef 
unique_ptr[column] left_semi_join( + cdef gather_map_type left_semi_join( const table_view left_keys, const table_view right_keys, ) except + - cdef unique_ptr[column] left_anti_join( + cdef gather_map_type left_anti_join( const table_view left_keys, const table_view right_keys, ) except + diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index f1677e3f856..03ad5fbeff7 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -3,7 +3,7 @@ from collections import OrderedDict from itertools import chain -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport unique_ptr, make_unique from libcpp.utility cimport move from libcpp.vector cimport vector from libcpp.pair cimport pair @@ -13,7 +13,7 @@ from cudf._lib.column cimport Column from cudf._lib.table cimport Table, columns_from_ptr from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.types cimport size_type, data_type, type_id from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cimport cudf._lib.cpp.join as cpp_join @@ -21,7 +21,7 @@ cimport cudf._lib.cpp.join as cpp_join cpdef join(Table lhs, Table rhs, how=None): # left, inner and outer join - cdef pair[unique_ptr[column], unique_ptr[column]] c_result + cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result cdef table_view c_lhs = lhs.view() cdef table_view c_rhs = rhs.view() @@ -42,9 +42,21 @@ cpdef join(Table lhs, Table rhs, how=None): )) else: raise ValueError(f"Invalid join type {how}") + + cdef size_type join_size = c_result.first.get()[0].size() + cdef unique_ptr[column] left_rows = make_unique[column]( + data_type(type_id.INT32), + join_size, + c_result.first.get()[0].release() + ) + cdef unique_ptr[column] right_rows = make_unique[column]( + data_type(type_id.INT32), + join_size, + c_result.second.get()[0].release() + ) return ( - 
Column.from_unique_ptr(move(c_result.first)), - Column.from_unique_ptr(move(c_result.second)) + Column.from_unique_ptr(move(left_rows)), + Column.from_unique_ptr(move(right_rows)) ) @@ -52,7 +64,7 @@ cpdef semi_join(Table lhs, Table rhs, how=None): from cudf.core.column import as_column # left-semi and left-anti joins - cdef unique_ptr[column] c_result + cdef cpp_join.gather_map_type c_result cdef table_view c_lhs = lhs.view() cdef table_view c_rhs = rhs.view() @@ -68,4 +80,14 @@ cpdef semi_join(Table lhs, Table rhs, how=None): )) else: raise ValueError(f"Invalid join type {how}") - return Column.from_unique_ptr(move(c_result)), as_column([], dtype="int32") + + cdef size_type join_size = c_result.get()[0].size() + cdef unique_ptr[column] left_rows = make_unique[column]( + data_type(type_id.INT32), + join_size, + c_result.get()[0].release() + ) + return ( + Column.from_unique_ptr(move(left_rows)), + as_column([], dtype="int32") + ) From 6cd6433dc036a4e44e0eda417c653979db8dd116 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 5 Mar 2021 12:08:45 -0500 Subject: [PATCH 068/138] Need to resize the gathermap --- cpp/src/join/semi_join.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 3f1de67a05b..073ef9eb243 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -112,6 +112,7 @@ std::unique_ptr> left_semi_anti_join( }); auto join_size = thrust::distance(gather_map->begin(), gather_map_end); + gather_map->resize(join_size, stream); return std::move(gather_map); } From c67dcced44b1ff85756d638a6af7172847022a02 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 5 Mar 2021 12:14:50 -0500 Subject: [PATCH 069/138] Doc --- python/cudf/cudf/_lib/join.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 03ad5fbeff7..d187df8b5bd 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ 
b/python/cudf/cudf/_lib/join.pyx @@ -19,8 +19,11 @@ from cudf._lib.cpp.table.table_view cimport table_view cimport cudf._lib.cpp.join as cpp_join +# The functions below return the *gathermaps* that represent +# the join result when joining on the keys `lhs` and `rhs`. + cpdef join(Table lhs, Table rhs, how=None): - # left, inner and outer join + cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result cdef table_view c_lhs = lhs.view() cdef table_view c_rhs = rhs.view() @@ -61,6 +64,7 @@ cpdef join(Table lhs, Table rhs, how=None): cpdef semi_join(Table lhs, Table rhs, how=None): + from cudf.core.column import as_column # left-semi and left-anti joins From 30c22ed04a4c362b41ce900a43573ee332a56596 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 5 Mar 2021 13:28:35 -0500 Subject: [PATCH 070/138] Changelog --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bb46ea0de4..6b08a042615 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -212,7 +212,6 @@ Please see https://github.com/rapidsai/cudf/releases/tag/branch-0.19-latest for - Add static type checking via Mypy (#6381) @shwina - Update to official libcu++ on Github (#6275) @trxcllnt -# cuDF 0.17.0 (Date TBD) # cuDF 0.17.0 (10 Dec 2020) ## New Features From f73199deb820518c896375da58f636f240dfb977 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 9 Mar 2021 16:15:51 -0500 Subject: [PATCH 071/138] Add helper to convert gather_map_type->Column --- python/cudf/cudf/_lib/join.pyx | 37 ++++++++++++++-------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index d187df8b5bd..9e21db00e5b 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -46,21 +46,9 @@ cpdef join(Table lhs, Table rhs, how=None): else: raise ValueError(f"Invalid join type {how}") - cdef size_type join_size = c_result.first.get()[0].size() - cdef unique_ptr[column] 
left_rows = make_unique[column]( - data_type(type_id.INT32), - join_size, - c_result.first.get()[0].release() - ) - cdef unique_ptr[column] right_rows = make_unique[column]( - data_type(type_id.INT32), - join_size, - c_result.second.get()[0].release() - ) - return ( - Column.from_unique_ptr(move(left_rows)), - Column.from_unique_ptr(move(right_rows)) - ) + cdef Column left_rows = _gather_map_as_column(move(c_result.first)) + cdef Column right_rows = _gather_map_as_column(move(c_result.second)) + return left_rows, right_rows cpdef semi_join(Table lhs, Table rhs, how=None): @@ -85,13 +73,18 @@ cpdef semi_join(Table lhs, Table rhs, how=None): else: raise ValueError(f"Invalid join type {how}") - cdef size_type join_size = c_result.get()[0].size() - cdef unique_ptr[column] left_rows = make_unique[column]( - data_type(type_id.INT32), - join_size, - c_result.get()[0].release() - ) + cdef Column left_rows = _gather_map_as_column(move(c_result)) return ( - Column.from_unique_ptr(move(left_rows)), + left_rows, as_column([], dtype="int32") ) + + +cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): + # helple to convert a gather map to a Column + cdef size_type size = gather_map.get()[0].size() + cdef unique_ptr[column] c_col = make_unique[column]( + data_type(type_id.INT32), + size, + gather_map.get()[0].release()) + return Column.from_unique_ptr(move(c_col)) From 393c06acd8488543c07ab35dd0f09c7f0e857c23 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 9 Mar 2021 16:22:56 -0500 Subject: [PATCH 072/138] Update python/cudf/cudf/core/frame.py Co-authored-by: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 090b2d848b9..f540c7144a5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2742,7 
+2742,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): Parameters ---------- by: list, optional - Labels specifyin columns to sort by. By default, + Labels specifying columns to sort by. By default, sort by all columns of `self` ascending : bool or list of bool, default True If True, sort values in ascending order, otherwise descending. From e91f5543c7e3ceaae503e068783673a6562f34b3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 9 Mar 2021 16:25:54 -0500 Subject: [PATCH 073/138] Cannot specify both column and index --- python/cudf/cudf/core/join/_join_helpers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 02a1a49564a..24ffe9040bf 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -29,6 +29,8 @@ class _Indexer: # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df def __init__(self, name: Any, column=False, index=False): + if column and index: + raise ValueError("Cannot specify both column and index") self.name = name self.column, self.index = column, index From 01858966c7da5ec5133cc2d2d585fcf0e9a9dd74 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 9 Mar 2021 16:57:58 -0500 Subject: [PATCH 074/138] Vaildate how --- python/cudf/cudf/tests/test_joining.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 6b969d66108..bce558aa46d 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -14,11 +14,13 @@ assert_exceptions_equal, ) +_JOIN_TYPES = {"left", "inner", "outer", "right", "leftanti", "leftsemi"} + def make_params(): np.random.seed(0) - hows = "left,inner,outer,right,leftanti,leftsemi".split(",") + hows = _JOIN_TYPES methods = "hash,sort".split(",") # Test specific cases (1) @@ -70,6 +72,8 
@@ def pd_odd_joins(left, right, join_type): def assert_join_results_equal(expect, got, how, **kwargs): + if how not in _JOIN_TYPES: + raise ValueError(f"Unrecognized join type {how}") if how == "right": got = got[expect.columns] From 1eb495d7c4b988211e7340c93ffc575f9cc11f18 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 9 Mar 2021 16:59:47 -0500 Subject: [PATCH 075/138] Can't use a set --- python/cudf/cudf/tests/test_joining.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index bce558aa46d..50141428b02 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -14,7 +14,7 @@ assert_exceptions_equal, ) -_JOIN_TYPES = {"left", "inner", "outer", "right", "leftanti", "leftsemi"} +_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") def make_params(): From 4f1f0725d189b58caa4e978f2f854e61036ce6d1 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Mar 2021 10:12:49 -0500 Subject: [PATCH 076/138] Avoid function local import --- python/cudf/cudf/_lib/join.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 9e21db00e5b..ee4c72483a7 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
+import cudf + from collections import OrderedDict from itertools import chain @@ -53,8 +55,6 @@ cpdef join(Table lhs, Table rhs, how=None): cpdef semi_join(Table lhs, Table rhs, how=None): - from cudf.core.column import as_column - # left-semi and left-anti joins cdef cpp_join.gather_map_type c_result cdef table_view c_lhs = lhs.view() @@ -76,7 +76,7 @@ cpdef semi_join(Table lhs, Table rhs, how=None): cdef Column left_rows = _gather_map_as_column(move(c_result)) return ( left_rows, - as_column([], dtype="int32") + cudf.core.column.as_column([], dtype="int32") ) From 4aa8fec8f7973f9e0501433b52d2db9e8de8d1b7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Mar 2021 14:14:57 -0500 Subject: [PATCH 077/138] False -> NotImplementedError --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 48b337e4738..d3bafec9a3b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1013,7 +1013,7 @@ def distinct_count( return cpp_distinct_count(self, ignore_nulls=dropna) def can_cast_safely(self, to_dtype: Dtype) -> bool: - return False + raise NotImplementedError() def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): From ae0e5f9249898849cf983fdb864cda09f2f1fb4e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 10 Mar 2021 14:19:32 -0500 Subject: [PATCH 078/138] Update cpp/include/cudf/join.hpp Co-authored-by: Jake Hemstad --- cpp/include/cudf/join.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b1f06697522..cfa717f9625 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -32,8 +32,7 @@ namespace cudf { */ /** - * @brief Returns the row indices that can be used to construct - * the result of performing 
an inner join between two tables. + * @brief Returns the indices of the matching rows resulting from an inner join between the specified tables. * * @code{.pseudo} * Left: {{0, 1, 2}} From f47cf7e500ce29856a57cc80f99b7c89cdc45932 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Mar 2021 18:06:21 -0500 Subject: [PATCH 079/138] Reuse some join logic --- cpp/include/cudf/table/table_view.hpp | 5 +++ cpp/src/join/join.cu | 59 +++++++++------------------ cpp/src/join/join_common_utils.hpp | 2 +- 3 files changed, 25 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 22f2073f73c..ff259ffdd6e 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -126,6 +126,11 @@ class table_view_base { */ size_type num_rows() const noexcept { return _num_rows; } + /** + * @brief Returns true if `num_columns()` returns zero, or false otherwise + */ + size_type is_empty() const noexcept { return num_columns() == 0; } + table_view_base() = default; ~table_view_base() = default; diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index bce72862220..6fedd3077b2 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -79,42 +79,21 @@ std::unique_ptr
inner_join(table_view const& left_input, auto const left = scatter_columns(matched.second.front(), left_on, left_input); auto const right = scatter_columns(matched.second.back(), right_on, right_input); - // For `inner_join`, we can freely choose either the `left` or `right` table to use for - // building/probing the hash map. Because building is typically more expensive than probing, we - // build the hash map from the smaller table. - if (right.num_rows() > left.num_rows()) { - cudf::hash_join hj_obj(left.select(left_on), compare_nulls, stream); - auto join_indices = hj_obj.inner_join(right.select(right_on), compare_nulls, stream, mr); - std::unique_ptr
left_result = detail::gather(left, - join_indices.second->begin(), - join_indices.second->end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - std::unique_ptr
right_result = detail::gather(right, - join_indices.first->begin(), - join_indices.first->end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - return combine_table_pair(std::move(left_result), std::move(right_result)); - } else { - cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); - auto join_indices = hj_obj.inner_join(left.select(left_on), compare_nulls, stream, mr); - std::unique_ptr
left_result = detail::gather(left, - join_indices.first->begin(), - join_indices.first->end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - std::unique_ptr
right_result = detail::gather(right, - join_indices.second->begin(), - join_indices.second->end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - return combine_table_pair(std::move(left_result), std::move(right_result)); - } + auto join_indices = + inner_join(left.select(left_on), right.select(right_on), compare_nulls, stream, mr); + std::unique_ptr
left_result = detail::gather(left, + join_indices.first->begin(), + join_indices.first->end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.second->begin(), + join_indices.second->end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); } std::pair>, @@ -157,8 +136,8 @@ std::unique_ptr
left_join(table_view const& left_input, table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); - auto join_indices = hj_obj.left_join(left.select(left_on), compare_nulls, stream, mr); + auto join_indices = + left_join(left.select(left_on), right.select(right_on), compare_nulls, stream, mr); if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) { @@ -221,8 +200,8 @@ std::unique_ptr
full_join(table_view const& left_input, table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - cudf::hash_join hj_obj(right.select(right_on), compare_nulls, stream); - auto join_indices = hj_obj.full_join(left.select(left_on), compare_nulls, stream, mr); + auto join_indices = + full_join(left.select(left_on), right.select(right_on), compare_nulls, stream, mr); if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) { diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index ed33fab685c..9312704f065 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -55,7 +55,7 @@ enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_AN inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) { // If there is nothing to join, then send empty table with all columns - if ((0 == left.num_columns()) || (0 == right.num_columns())) { return true; } + if (left.is_empty() || right.is_empty()) { return true; } // If left join and the left table is empty, return immediately if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } From 230ca0890fa9b1fb014bb40d3cfd9be06e1a8d3f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 10 Mar 2021 18:07:47 -0500 Subject: [PATCH 080/138] Formatting --- cpp/src/join/join.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 6fedd3077b2..8f513187ef4 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -51,9 +51,7 @@ inner_join(table_view const& left_input, if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); auto result = hj_obj.inner_join(right, compare_nulls, stream, mr); - return 
std::make_pair>, - std::unique_ptr>>(std::move(result.second), - std::move(result.first)); + return std::make_pair(std::move(result.second), std::move(result.first)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); return hj_obj.inner_join(left, compare_nulls, stream, mr); From 498a62123940b12f14d821c96984f6e3a9d7aa99 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 11 Mar 2021 09:33:37 -0500 Subject: [PATCH 081/138] Update cpp/include/cudf/join.hpp Co-authored-by: Jake Hemstad --- cpp/include/cudf/join.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cfa717f9625..c284573a253 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -111,6 +111,7 @@ std::unique_ptr inner_join( /** * @brief Returns the row indices that can be used to construct * the result of performing a left join between two tables. + * * For rows in the right table that do not have a match in the * left table, the row index is an unspecified out-of-bounds value. * From 2de26f3059b78bd52bd85eafa58b8d5d60cf11cf Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 11 Mar 2021 10:33:45 -0500 Subject: [PATCH 082/138] Docs? --- cpp/include/cudf/join.hpp | 148 +++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cfa717f9625..7c778a17609 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -32,7 +32,13 @@ namespace cudf { */ /** - * @brief Returns the indices of the matching rows resulting from an inner join between the specified tables. + * @brief Returns a pair of row index vectors corresponding to an + * inner join between the specified tables. + * + * The first returned vector contains the row indices from the left + * table that have a match in the right table (in unspecified order). 
+ * The corresponding values in the second returned vector are + * the matched row indices from the right table. * * @code{.pseudo} * Left: {{0, 1, 2}} @@ -46,13 +52,13 @@ namespace cudf { * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` * mismatch. * - * @param[in] left A table representing the keys of the left table of the join - * @param[in] right A table representing the keys of the right table of the join + * @param[in] left_keys The left table + * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device memory * - * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct * the result of performing an inner join between two tables with `left_keys` and `right_keys` * as the join keys . */ @@ -109,10 +115,14 @@ std::unique_ptr inner_join( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns the row indices that can be used to construct - * the result of performing a left join between two tables. - * For rows in the right table that do not have a match in the - * left table, the row index is an unspecified out-of-bounds value. + * @brief Returns a pair of row index vectors corresponding to a + * left join between the specified tables. + * + * The first returned vector contains all the row indices from the left + * table (in unspecified order). The corresponding value in the + * second returned vector is either (1) the row index of the matched row + * from the right table, if there is a match or (2) an unspecified + * out-of-bounds value. 
* * @code{.pseudo} * Left: {{0, 1, 2}} @@ -126,13 +136,13 @@ std::unique_ptr inner_join( * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` * mismatch. * - * @param[in] left A table representing the keys of the left table of the join - * @param[in] right A table representing the keys of the right table of the join + * @param[in] left_keys The left table + * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device memory * - * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct * the result of performing a left join between two tables with `left_keys` and `right_keys` * as the join keys . */ @@ -197,8 +207,13 @@ std::unique_ptr left_join( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns the row indices that can be used to construct - * the result of performing a full join between two tables. + * @brief Returns a pair of row index vectors corresponding to a + * full join between the specified tables. + * + * Taken pairwise, the values from the returned vectors are one of: + * (1) row indices corresponding to matching rows from the left and + * right tables, (2) a row index and an unspecified out-of-bounds value, + * representing a row from one table without a match in the other. * * @code{.pseudo} * Left: {{0, 1, 2}} @@ -212,13 +227,13 @@ std::unique_ptr left_join( * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` * mismatch. 
* - * @param[in] left A table representing the keys of the left table of the join - * @param[in] right A table representing the keys of the right table of the join + * @param[in] left The left table + * @param[in] right The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device memory * - * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct * the result of performing a full join between two tables with `left_keys` and `right_keys` * as the join keys . */ @@ -282,6 +297,39 @@ std::unique_ptr full_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a vector of row indices corresponding to a left semi join + * between the specified tables. + * + * The returned vector contains the row indices from the left table + * for which there is a matching row in the right table. + * + * @code{.pseudo} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}} + * right_on: {1} + * Result: {1, 2} + * @endcode + * + * @throw cudf::logic_error if number of columns in either + * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE + * + * @param[in] left_keys The left table + * @param[in] right_keys The right table + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A vector `left_indices` that can be used to construct + * the result of performing a left semi join between two tables with + * `left_keys` and `right_keys` as the join keys . 
+ */ +std::unique_ptr> left_semi_join( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a left semi join on the specified columns of two * tables (`left`, `right`) @@ -331,39 +379,31 @@ std::unique_ptr left_semi_join( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns the row indices that can be used to construct - * the result of performing a left semi join between two tables. + * @brief Returns a vector of row indices corresponding to a left anti join + * between the specified tables. + * + * The returned vector contains the row indices from the left table + * for which there is no matching row in the right table. * * @code{.pseudo} * TableA: {{0, 1, 2}} - * TableB: {{1, 2, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {1} - * Result: {1, 2} - * - * TableA {{0, 1, 2}, {1, 2, 5}} - * TableB {{1, 2, 3}} - * left_on: {0} - * right_on: {0} - * Result: {1, 2} + * TableB: {{1, 2, 3}} + * Result: {0} * @endcode * - * @throw cudf::logic_error if number of elements in `left_on` or `right_on` - * mismatch. - * @throw cudf::logic_error if number of columns in either `left` or `right` - * table is 0 or exceeds MAX_JOIN_SIZE + * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0 * - * @param[in] left A table representing the keys of the left table of the join - * @param[in] right A table representing the keys of the right table of the join + * @param[in] left_keys The left table + * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
* @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A column `left_indices` that can be used to construct - * the result of performing a left semi join between two tables with + * the result of performing a left anti join between two tables with * `left_keys` and `right_keys` as the join keys . */ -std::unique_ptr> left_semi_join( +std::unique_ptr> left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, @@ -420,42 +460,6 @@ std::unique_ptr left_anti_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns the row indices that can be used to construct - * the result of performing a left anti join between two tables. - * - * @code{.pseudo} - * TableA: {{0, 1, 2}} - * TableB: {{1, 2, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {1} - * Result: {0} - * - * TableA: {{0, 1, 2}, {1, 2, 5}} - * TableB: {{1, 2, 3}} - * left_on: {0} - * right_on: {0} - * Result: {0} - * @endcode - * - * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0 - * - * @param[in] left A table representing the keys of the left table of the join - * @param[in] right A table representing the keys of the right table of the join - * @param[in] compare_nulls controls whether null join-key values - * should match or not. - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A column `left_indices` that can be used to construct - * the result of performing a left anti join between two tables with - * `left_keys` and `right_keys` as the join keys . 
- */ -std::unique_ptr> left_anti_join( - cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Performs a cross join on two tables (`left`, `right`) * From b7d8d8aeafd1af20c7eaeff0d60abd35e96f5290 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 11 Mar 2021 12:43:22 -0500 Subject: [PATCH 083/138] Use mr --- cpp/src/join/hash_join.cu | 36 ++++++++++++++++++++---------------- cpp/src/join/hash_join.cuh | 15 +++++++++++---- cpp/src/join/join.cu | 9 +++------ cpp/src/join/semi_join.cu | 7 ++++--- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 301726a978f..aa84bc56cce 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -87,10 +87,12 @@ struct valid_range { */ std::pair>, std::unique_ptr>> -get_left_join_indices_complement(std::unique_ptr> &right_indices, - size_type left_table_row_count, - size_type right_table_row_count, - rmm::cuda_stream_view stream) +get_left_join_indices_complement( + std::unique_ptr> &right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { // Get array of indices that do not appear in right_indices @@ -224,15 +226,16 @@ probe_join_hash_table(cudf::table_device_view build_table, cudf::table_device_view probe_table, multimap_type const &hash_table, null_equality compare_nulls, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { size_type estimated_size = estimate_join_output_size( build_table, probe_table, hash_table, compare_nulls, stream); // If the estimated output size is zero, return immediately if (estimated_size == 0) { - return std::make_pair(std::make_unique>(0, stream), - 
std::make_unique>(0, stream)); + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } // Because we are approximating the number of joined elements, our approximation @@ -242,8 +245,8 @@ probe_join_hash_table(cudf::table_device_view build_table, rmm::device_scalar write_index(0, stream); size_type join_size{0}; - auto left_indices = std::make_unique>(0, stream); - auto right_indices = std::make_unique>(0, stream); + auto left_indices = std::make_unique>(0, stream, mr); + auto right_indices = std::make_unique>(0, stream, mr); auto current_estimated_size = estimated_size; do { @@ -358,8 +361,8 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, "Mismatch in number of columns to be joined on"); if (is_trivial_join(probe, _build, JoinKind)) { - return std::make_pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } CUDF_EXPECTS(std::equal(std::cbegin(_build), @@ -369,7 +372,7 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, [](const auto &b, const auto &p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); - return probe_join_indices(probe, compare_nulls, stream); + return probe_join_indices(probe, compare_nulls, stream, mr); } template @@ -377,11 +380,12 @@ std::pair>, std::unique_ptr>> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, null_equality compare_nulls, - rmm::cuda_stream_view stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { // Trivial left join case - exit early if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) { - return get_trivial_left_join_indices(probe, stream); + return get_trivial_left_join_indices(probe, stream, mr); } CUDF_EXPECTS(_hash_table, "Hash table of hash join is null."); @@ -393,11 +397,11 @@ 
hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, ? cudf::detail::join_kind::LEFT_JOIN : JoinKind; auto join_indices = cudf::detail::probe_join_hash_table( - *build_table, *probe_table, *_hash_table, compare_nulls, stream); + *build_table, *probe_table, *_hash_table, compare_nulls, stream, mr); if (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe.num_rows(), _build.num_rows(), stream); + join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } return join_indices; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 3bf20eb9433..c502ff18260 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -181,16 +181,21 @@ size_type estimate_join_output_size(table_device_view build_table, * * @param left Table of left columns to join * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the result * * @return Join output indices vector pair */ inline std::pair>, std::unique_ptr>> -get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream) +get_trivial_left_join_indices( + table_view const& left, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto left_indices = std::make_unique>(left.num_rows(), stream); + auto left_indices = std::make_unique>(left.num_rows(), stream, mr); thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); - auto right_indices = std::make_unique>(left.num_rows(), stream); + auto right_indices = + std::make_unique>(left.num_rows(), stream, mr); thrust::fill( rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); return 
std::make_pair(std::move(left_indices), std::move(right_indices)); @@ -274,6 +279,7 @@ struct hash_join::hash_join_impl { * @param probe_table Table of probe side columns to join. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned vectors. * * @return Join output indices vector pair. */ @@ -282,7 +288,8 @@ struct hash_join::hash_join_impl { std::unique_ptr>> probe_join_indices(cudf::table_view const& probe, null_equality compare_nulls, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; }; } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 8f513187ef4..f2e4bab02c6 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -77,8 +77,7 @@ std::unique_ptr
inner_join(table_view const& left_input, auto const left = scatter_columns(matched.second.front(), left_on, left_input); auto const right = scatter_columns(matched.second.back(), right_on, right_input); - auto join_indices = - inner_join(left.select(left_on), right.select(right_on), compare_nulls, stream, mr); + auto join_indices = inner_join(left.select(left_on), right.select(right_on), compare_nulls, mr); std::unique_ptr
left_result = detail::gather(left, join_indices.first->begin(), join_indices.first->end(), @@ -134,8 +133,7 @@ std::unique_ptr
left_join(table_view const& left_input, table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - auto join_indices = - left_join(left.select(left_on), right.select(right_on), compare_nulls, stream, mr); + auto join_indices = left_join(left.select(left_on), right.select(right_on), compare_nulls); if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) { @@ -198,8 +196,7 @@ std::unique_ptr
full_join(table_view const& left_input, table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - auto join_indices = - full_join(left.select(left_on), right.select(right_on), compare_nulls, stream, mr); + auto join_indices = full_join(left.select(left_on), right.select(right_on), compare_nulls); if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) { diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 073ef9eb243..fea9ea45fd3 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -49,11 +49,11 @@ std::unique_ptr> left_semi_anti_join( CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); if (is_trivial_join(left_keys, right_keys, JoinKind)) { - return std::make_unique>(0, stream); + return std::make_unique>(0, stream, mr); } if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { auto result = - std::make_unique>(left_keys.num_rows(), stream); + std::make_unique>(left_keys.num_rows(), stream, mr); thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end()); return std::move(result); } @@ -98,7 +98,8 @@ std::unique_ptr> left_semi_anti_join( // For semi join we want contains to be true, for anti join we want contains to be false bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); - auto gather_map = std::make_unique>(left_num_rows, stream); + auto gather_map = + std::make_unique>(left_num_rows, stream, mr); // gather_map_end will be the end of valid data in gather_map auto gather_map_end = thrust::copy_if( From 9efc7614c690c106383d34bce04dd1d9931dcf63 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 15 Mar 2021 17:58:27 -0400 Subject: [PATCH 084/138] Docs --- python/cudf/cudf/core/join/join.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) 
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 1377ecf5df8..698a99751d4 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -262,10 +262,11 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: # Merge the Frames `left_result` and `right_result` into a single # `Frame`, suffixing column names if necessary. - # For outer joins, the key columns from left_result and - # right_result are combined if they have the same name. - # We will drop those keys from right_result later, so - # combine them now with keys from left_result. + # If two key columns have the same name, a single output column appears + # in the result. For all other join types, the key column from the rhs + # is simply dropped. For outer joins, the two key columns are combined + # by filling nulls in the left key column with corresponding values + # from the right key column: if self.how == "outer": for lkey, rkey in zip(*self._keys): if lkey.name == rkey.name: @@ -275,11 +276,17 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: lkey.get(left_result).fillna(rkey.get(right_result)), ) - # `left_names` and `right_names` are mappings of column names - # of `lhs` and `rhs` to the corresponding column names in the result + # Compute the result column names: + # left_names and right_names will be a mappings of input column names + # to the corresponding names in the final result. 
left_names = OrderedDict(zip(left_result._data, left_result._data)) right_names = OrderedDict(zip(right_result._data, right_result._data)) + # For any columns from left_result and right_result that have the same + # name: + # - if they are key columns, keep only the left column + # - if they are not key columns, use suffixes to differentiate them + # in the final result common_names = set(left_names) & set(right_names) if self.on: @@ -291,9 +298,6 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: if lkey.name == rkey.name: key_columns_with_same_name.append(lkey.name) - # For any columns with the same name: - # - if they are key columns, keep only the left column - # - if they are not key columns, use suffixes for name in common_names: if name not in key_columns_with_same_name: left_names[name] = f"{name}{self.lsuffix}" From 8779bc7c16ea2304bfa0ba0d32d7962e8e10abc8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 16 Mar 2021 10:10:17 -0400 Subject: [PATCH 085/138] Simplify suffix handling --- python/cudf/cudf/core/dataframe.py | 9 +-- python/cudf/cudf/core/frame.py | 105 +++-------------------------- python/cudf/cudf/core/join/join.py | 28 +------- python/cudf/cudf/core/series.py | 19 ++++-- 4 files changed, 28 insertions(+), 133 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 18a7f052d62..0f5adf8aba1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4221,12 +4221,9 @@ def merge( else: lsuffix, rsuffix = suffixes - lhs = self.copy(deep=False) - rhs = right.copy(deep=False) - # Compute merge - gdf_result = super(DataFrame, lhs)._merge( - rhs, + gdf_result = super()._merge( + right, on=on, left_on=left_on, right_on=right_on, @@ -4234,8 +4231,6 @@ def merge( right_index=right_index, how=how, sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, method=method, indicator=indicator, suffixes=suffixes, diff --git 
a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f540c7144a5..fb18d9c3cf9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3319,77 +3319,6 @@ def sqrt(self): """ return self._unaryop("sqrt") - @staticmethod - def _validate_merge_cfg( - lhs, - rhs, - left_on, - right_on, - on, - how, - left_index=False, - right_index=False, - lsuffix=None, - rsuffix=None, - ): - """ - Error for various combinations of merge input parameters - """ - len_left_on = len(left_on) if left_on is not None else 0 - len_right_on = len(right_on) if right_on is not None else 0 - - # must actually support the requested merge type - if how not in ["left", "inner", "outer", "leftanti", "leftsemi"]: - raise NotImplementedError(f"{how} merge not supported yet") - - # Passing 'on' with 'left_on' or 'right_on' is potentially ambiguous - if on: - if left_on or right_on: - raise ValueError( - 'Can only pass argument "on" OR "left_on" ' - 'and "right_on", not a combination of both.' 
- ) - - # Require same total number of columns to join on in both operands - if not (len_left_on + left_index * len(lhs.index.names)) == ( - len_right_on + right_index * len(rhs.index.names) - ): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - - # If nothing specified, must have common cols to use implicitly - same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys()) - if not (left_index or right_index): - if not (left_on or right_on): - if len(same_named_columns) == 0: - raise ValueError("No common columns to perform merge on") - - for name in same_named_columns: - if not ( - name in left_on - and name in right_on - and (left_on.index(name) == right_on.index(name)) - ): - if not (lsuffix or rsuffix): - raise ValueError( - "there are overlapping columns but " - "lsuffix and rsuffix are not defined" - ) - - if on: - on_keys = [on] if not isinstance(on, list) else on - for key in on_keys: - if not (key in lhs._data.keys() and key in rhs._data.keys()): - raise KeyError(f"Key {on} not in both operands") - else: - for key in left_on: - if key not in lhs._data.keys(): - raise KeyError(f'Key "{key}" not in left operand') - for key in right_on: - if key not in rhs._data.keys(): - raise KeyError(f'Key "{key}" not in right operand') - def _merge( self, right, @@ -3400,34 +3329,24 @@ def _merge( right_index=False, how="inner", sort=False, - lsuffix=None, - rsuffix=None, method="hash", indicator=False, suffixes=("_x", "_y"), ): - # Merge doesn't support right, so just swap - if how == "right": - return right._merge( - self, - on=on, - left_on=right_on, - right_on=left_on, - left_index=right_index, - right_index=left_index, - how="left", - sort=sort, - lsuffix=rsuffix, - rsuffix=lsuffix, - method=method, - indicator=indicator, - suffixes=suffixes, - ) from cudf.core.join.join import merge + lhs, rhs = self, right + if how == "right": + # Merge doesn't support right, so just swap + how = "left" + lhs, rhs = right, self + left_on, 
right_on = right_on, left_on + left_index, right_index = right_index, left_index + suffixes = (suffixes[1], suffixes[0]) + return merge( - self, - right, + lhs, + rhs, on=on, left_on=left_on, right_on=right_on, @@ -3435,8 +3354,6 @@ def _merge( right_index=right_index, how=how, sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, method=method, indicator=indicator, suffixes=suffixes, diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 698a99751d4..d95a7d292f4 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -29,8 +29,6 @@ def merge( right_index, how, sort, - lsuffix, - rsuffix, method, indicator, suffixes, @@ -49,8 +47,6 @@ def merge( right_index=right_index, how=how, sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, method=method, indicator=indicator, suffixes=suffixes, @@ -91,8 +87,6 @@ def __init__( right_index, how, sort, - lsuffix, - rsuffix, method, indicator, suffixes, @@ -127,14 +121,6 @@ def __init__( sort : bool Boolean flag indicating if the output Frame is to be sorted on the output's join keys, in left to right order. - lsuffix : string - The suffix to be appended to left hand column names that - are found to exist in the right frame, but are not specified - as join keys themselves. - rsuffix : string - The suffix to be appended to right hand column names that - are found to exist in the left frame, but are not specified - as join keys themselves. suffixes : list like Left and right suffixes specified together, unpacked into lsuffix and rsuffix. 
@@ -148,11 +134,8 @@ def __init__( left_index=left_index, right_index=right_index, how=how, - lsuffix=lsuffix, - rsuffix=rsuffix, suffixes=suffixes, ) - self.lhs = lhs self.rhs = rhs self.on = on @@ -162,10 +145,8 @@ def __init__( self.right_index = right_index self.how = how self.sort = sort - self.lsuffix = lsuffix - self.rsuffix = rsuffix - self.suffixes = suffixes - + if suffixes: + self.lsuffix, self.rsuffix = suffixes self._compute_join_keys() @property @@ -369,8 +350,6 @@ def _validate_merge_params( left_index, right_index, how, - lsuffix, - rsuffix, suffixes, ): """ @@ -402,8 +381,7 @@ def _validate_merge_params( ): raise ValueError("No common columns to perform merge on") - if suffixes: - lsuffix, rsuffix = suffixes + lsuffix, rsuffix = suffixes for name in same_named_columns: if name == left_on == right_on: continue diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f80c6a9b452..5d9ded90a24 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4492,17 +4492,24 @@ def merge( method="hash", suffixes=("_x", "_y"), ): - if left_on not in (self.name, None): raise ValueError( "Series to other merge uses series name as key implicitly" ) - lhs = self.copy(deep=False) - rhs = other.copy(deep=False) + if lsuffix or rsuffix: + raise ValueError( + "The lsuffix and rsuffix keywords have been replaced with the " + "``suffixes=`` keyword. 
" + "Please provide the following instead: \n\n" + " suffixes=('%s', '%s')" + % (lsuffix or "_x", rsuffix or "_y") + ) + else: + lsuffix, rsuffix = suffixes - result = super(Series, lhs)._merge( - rhs, + result = super()._merge( + other, on=on, left_on=left_on, right_on=right_on, @@ -4510,8 +4517,6 @@ def merge( right_index=right_index, how=how, sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, method=method, indicator=False, suffixes=suffixes, From 4c651ac3899574c70e88a6b7de9e8b989420050a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Mar 2021 10:14:24 -0400 Subject: [PATCH 086/138] Simplify joiner requirements --- python/cudf/cudf/core/join/join.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index d95a7d292f4..76eb32f6f74 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. from __future__ import annotations +import functools from collections import OrderedDict, namedtuple from typing import TYPE_CHECKING, Callable, Tuple @@ -63,17 +64,14 @@ class Merge(object): # The joiner function must have the following signature: # - # def joiner(lhs, rhs, how=how): + # def joiner(lhs, rhs): # ... # - # where: - # - # - `lhs` and `rhs` are Frames composed of the left and right join keys - # - `how` is a string specifying the kind of join to perform - # - # ...and it returns a tuple of two gather maps representing the rows - # to gather from the left- and right- side tables respectively. - _joiner: Callable = libcudf.join.join + # where `lhs` and `rhs` are Frames composed of the left and right + # join key, and `joiner` returns a tuple of two gather maps + # representing the rows to gather from the left- and right- side + # tables respectively. 
+ _joiner: Callable def __init__( self, @@ -136,6 +134,8 @@ def __init__( how=how, suffixes=suffixes, ) + self._joiner = functools.partial(libcudf.join.join, how=how) + self.lhs = lhs self.rhs = rhs self.on = on @@ -434,7 +434,11 @@ def _restore_categorical_keys( class MergeSemi(Merge): - _joiner = libcudf.join.semi_join + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._joiner = functools.partial( + libcudf.join.semi_join, how=kwargs["how"] + ) def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: return super()._merge_results(lhs, cudf.core.frame.Frame()) From b4f4d7c85b0c72220099e3516c693fb451a51b23 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Mar 2021 10:31:35 -0400 Subject: [PATCH 087/138] Do less work in SemiJoin._merge_results --- python/cudf/cudf/core/join/join.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 76eb32f6f74..3ca0111c74d 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -441,4 +441,7 @@ def __init__(self, *args, **kwargs): ) def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: - return super()._merge_results(lhs, cudf.core.frame.Frame()) + if issubclass(self._out_class, cudf.Index): + return self._out_class._from_data(lhs) + else: + return self._out_class._from_data(lhs._data, index=lhs._index) From d353c92c510a5cad9d11b90986fb13a9df9f80df Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Mar 2021 10:33:36 -0400 Subject: [PATCH 088/138] Doc --- python/cudf/cudf/core/join/join.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 3ca0111c74d..20eb5101d4c 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -441,6 +441,7 @@ def __init__(self, *args, **kwargs): ) def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: + # 
semi-join result includes only lhs columns if issubclass(self._out_class, cudf.Index): return self._out_class._from_data(lhs) else: From 580a346d95e218b173b10a0f37596e5cfa3a949b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Mar 2021 10:35:37 -0400 Subject: [PATCH 089/138] Doc --- python/cudf/cudf/core/join/join.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 20eb5101d4c..f799171c2a8 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -151,6 +151,7 @@ def __init__( @property def _out_class(self): + # type of the result out_class = cudf.DataFrame if isinstance(self.lhs, cudf.MultiIndex) or isinstance( From 328dafdb548f6dfdf47df4b60d67652a4b8c971d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Mar 2021 11:40:03 -0400 Subject: [PATCH 090/138] Return None from semi_join --- python/cudf/cudf/_lib/join.pyx | 4 +--- python/cudf/cudf/core/join/join.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index ee4c72483a7..69b8004cede 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -25,7 +25,6 @@ cimport cudf._lib.cpp.join as cpp_join # the join result when joining on the keys `lhs` and `rhs`. 
cpdef join(Table lhs, Table rhs, how=None): - cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result cdef table_view c_lhs = lhs.view() cdef table_view c_rhs = rhs.view() @@ -54,7 +53,6 @@ cpdef join(Table lhs, Table rhs, how=None): cpdef semi_join(Table lhs, Table rhs, how=None): - # left-semi and left-anti joins cdef cpp_join.gather_map_type c_result cdef table_view c_lhs = lhs.view() @@ -76,7 +74,7 @@ cpdef semi_join(Table lhs, Table rhs, how=None): cdef Column left_rows = _gather_map_as_column(move(c_result)) return ( left_rows, - cudf.core.column.as_column([], dtype="int32") + None ) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index f799171c2a8..0d45abb76c2 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -64,11 +64,14 @@ class Merge(object): # The joiner function must have the following signature: # - # def joiner(lhs, rhs): + # def joiner( + # lhs: Frame, + # rhs: Frame + # ) -> Tuple[Optional[Column], Optional[Column]]: # ... # # where `lhs` and `rhs` are Frames composed of the left and right - # join key, and `joiner` returns a tuple of two gather maps + # join key. The `joiner` returns a tuple of two Columns # representing the rows to gather from the left- and right- side # tables respectively. 
_joiner: Callable @@ -173,8 +176,13 @@ def perform_merge(self) -> Frame: ) lhs, rhs = self._restore_categorical_keys(lhs, rhs) - left_result = lhs._gather(left_rows, nullify=True) - right_result = rhs._gather(right_rows, nullify=True) + left_result = cudf.core.frame.Frame() + right_result = cudf.core.frame.Frame() + + if left_rows is not None: + left_result = lhs._gather(left_rows, nullify=True) + if right_rows is not None: + right_result = rhs._gather(right_rows, nullify=True) result = self._merge_results(left_result, right_result) From 297d20abe71c5463557eca071573fda35046166a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 17 Mar 2021 13:06:28 -0400 Subject: [PATCH 091/138] Init common_type --- python/cudf/cudf/core/join/_join_helpers.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 24ffe9040bf..209a6c5c4e4 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -74,8 +74,9 @@ def _frame_select_by_indexers( def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: - # cast the keys lcol and rcol to a common dtype + common_type = None + # cast the keys lcol and rcol to a common dtype ltype = lcol.dtype rtype = rcol.dtype @@ -109,10 +110,7 @@ def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: "upcasting to {common_type}." ) - if common_type: - return common_type - - return None + return common_type def _match_categorical_dtypes(ltype: Dtype, rtype: Dtype, how: str) -> Dtype: From 935648b01220deac0dc306014ecd82f5191fc6f5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 13:48:34 -0700 Subject: [PATCH 092/138] Move validation directly into set_by_label and use a raw dict to store the columns in the accessor. 
--- python/cudf/cudf/core/column_accessor.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index ad1a0c80ef5..a1de373eb37 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,6 @@ from __future__ import annotations import itertools -from collections import OrderedDict from collections.abc import MutableMapping from typing import ( TYPE_CHECKING, @@ -18,8 +17,8 @@ import pandas as pd import cudf +from cudf.core import column from cudf.utils.utils import ( - OrderedColumnDict, cached_property, to_flat_dict, to_nested_dict, @@ -31,7 +30,7 @@ class ColumnAccessor(MutableMapping): - _data: "OrderedDict[Any, ColumnBase]" + _data: "dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] @@ -64,7 +63,7 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = OrderedColumnDict(data) + self._data = dict(data) self.multiindex = multiindex self._level_names = level_names @@ -280,6 +279,15 @@ def set_by_label(self, key: Any, value: Any): value : column-like """ key = self._pad_key(key) + + # Convert all types to columns and ensure that values are of equal + # length. + value = column.as_column(value) + if len(self._data) > 0: + first = next(iter(self._data.values())) + if len(value) != len(first): + raise ValueError("All columns must be of equal length") + self._data[key] = value self._clear_cache() From 806a3ef7740414bf16e21ea8b112982537a6f5ad Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 13:54:27 -0700 Subject: [PATCH 093/138] Remove all references to OrderedColumnDict. 
--- python/cudf/cudf/_lib/table.pyx | 8 ++++---- python/cudf/cudf/core/dataframe.py | 3 +-- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/utils/utils.py | 30 ------------------------------ 4 files changed, 7 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index dba0abb9cf0..f97b45d8abf 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -34,8 +34,8 @@ cdef class Table: Parameters ---------- - data : OrderedColumnDict - An OrderedColumnDict mapping column names to Columns + data : dict + An dict mapping column names to Columns index : Table A Table representing the (optional) index columns. """ @@ -109,7 +109,7 @@ cdef class Table: it += 1 index = Table(dict(zip(index_names, index_columns))) - # Construct the data OrderedColumnDict + # Construct the data dict data_columns = [] for _ in column_names: data_columns.append(Column.from_unique_ptr(move(dereference(it)))) @@ -154,7 +154,7 @@ cdef class Table: column_idx += 1 index = Table(dict(zip(index_names, index_columns))) - # Construct the data OrderedColumnDict + # Construct the data dict cdef size_type source_column_idx = 0 data_columns = [] for _ in column_names: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 18a7f052d62..a04dbb826a8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -52,7 +52,6 @@ is_struct_dtype, numeric_normalize_types, ) -from cudf.utils.utils import OrderedColumnDict T = TypeVar("T", bound="DataFrame") @@ -4599,7 +4598,7 @@ def hash_columns(self, columns=None): table_to_hash = self else: cols = [self[k]._column for k in columns] - table_to_hash = Frame(data=OrderedColumnDict(zip(columns, cols))) + table_to_hash = Frame(data=dict(zip(columns, cols))) return Series(table_to_hash._hash()).values diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 926aad368b0..e33fda3ee09 100644 
--- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -39,8 +39,8 @@ class Frame(libcudf.table.Table): Parameters ---------- - data : OrderedColumnDict - An OrderedColumnDict mapping column names to Columns + data : dict + An dict mapping column names to Columns index : Table A Frame representing the (optional) index columns. """ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 03a39f6fb4b..ba9fa734248 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -280,36 +280,6 @@ def __get__(self, instance, cls): return value -class ColumnValuesMappingMixin: - """ - Coerce provided values for the mapping to Columns. - """ - - def __setitem__(self, key, value): - - value = column.as_column(value) - super().__setitem__(key, value) - - -class EqualLengthValuesMappingMixin: - """ - Require all values in the mapping to have the same length. - """ - - def __setitem__(self, key, value): - if len(self) > 0: - first = next(iter(self.values())) - if len(value) != len(first): - raise ValueError("All values must be of equal length") - super().__setitem__(key, value) - - -class OrderedColumnDict( - ColumnValuesMappingMixin, EqualLengthValuesMappingMixin, OrderedDict -): - pass - - class NestedMappingMixin: """ Make missing values of a mapping empty instances From 40a7b173bb3a86bd8d2473121aa519765c442e7f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 13:58:35 -0700 Subject: [PATCH 094/138] Move validation to separate method and use in both set_by_label and constructor. 
--- python/cudf/cudf/core/column_accessor.py | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index a1de373eb37..c6b9236f0d0 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -63,7 +63,7 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = dict(data) + self._data = {k: self._convert_and_validate(v) for k, v in data.items()} self.multiindex = multiindex self._level_names = level_names @@ -269,6 +269,18 @@ def select_by_index(self, index: Any) -> ColumnAccessor: data, multiindex=self.multiindex, level_names=self.level_names, ) + def _convert_and_validate(self, value: Any): + # Make sure that the provided value can be stored as a column. This + # method will convert the column to an appropriate type and make sure + # that it is the same type as other columns in the accessor. + + value = column.as_column(value) + if len(self._data) > 0: + first = next(iter(self._data.values())) + if len(value) != len(first): + raise ValueError("All columns must be of equal length") + return value + def set_by_label(self, key: Any, value: Any): """ Add (or modify) column by name. @@ -279,16 +291,7 @@ def set_by_label(self, key: Any, value: Any): value : column-like """ key = self._pad_key(key) - - # Convert all types to columns and ensure that values are of equal - # length. 
- value = column.as_column(value) - if len(self._data) > 0: - first = next(iter(self._data.values())) - if len(value) != len(first): - raise ValueError("All columns must be of equal length") - - self._data[key] = value + self._data[key] = self._convert_and_validate(value) self._clear_cache() def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: From a1c576ebe2cb9e5e344a9aaa00a7c2ef4044c5c6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:02:25 -0700 Subject: [PATCH 095/138] Format with black. --- python/cudf/cudf/core/column_accessor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c6b9236f0d0..fe8058c31ce 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -63,7 +63,9 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = {k: self._convert_and_validate(v) for k, v in data.items()} + self._data = { + k: self._convert_and_validate(v) for k, v in data.items() + } self.multiindex = multiindex self._level_names = level_names From 788d9d6a0bd1e8254dba31b3e085ed56abec0160 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:08:21 -0700 Subject: [PATCH 096/138] Expose parameter to make validation optional. 
--- python/cudf/cudf/core/column_accessor.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index fe8058c31ce..38832396f1f 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -283,17 +283,24 @@ def _convert_and_validate(self, value: Any): raise ValueError("All columns must be of equal length") return value - def set_by_label(self, key: Any, value: Any): + def set_by_label(self, key: Any, value: Any, validate: bool = True): """ Add (or modify) column by name. Parameters ---------- - key : name of the column + key + name of the column value : column-like + The value to insert into the column. + validate : bool + If True, the provided value will be coerced to a column and + validated before setting (Default value = True). """ key = self._pad_key(key) - self._data[key] = self._convert_and_validate(value) + if validate: + value = self._convert_and_validate(value) + self._data[key] = value self._clear_cache() def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: From 6a64285f4e36a7437c550cf57ccd30dcf694f2e0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:13:55 -0700 Subject: [PATCH 097/138] Coerce constructor input to dict before calling items. 
--- python/cudf/cudf/core/column_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 38832396f1f..034c74393b1 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -64,7 +64,7 @@ def __init__( self._level_names = level_names self._data = { - k: self._convert_and_validate(v) for k, v in data.items() + k: self._convert_and_validate(v) for k, v in dict(data).items() } self.multiindex = multiindex self._level_names = level_names From e7d09812a3ee18a3ce8cbe3208c9a526d353a38d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 14:21:47 -0700 Subject: [PATCH 098/138] Make construction safe. --- python/cudf/cudf/core/column_accessor.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 034c74393b1..2b5ed21b010 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -63,9 +63,12 @@ def __init__( self.multiindex = multiindex self._level_names = level_names - self._data = { - k: self._convert_and_validate(v) for k, v in dict(data).items() - } + # Explicitly initialize an empty data dict so that we can validate each + # new column. + self._data = {} + for k, v in dict(data).items(): + self._data[k] = self._convert_and_validate(v) + self.multiindex = multiindex self._level_names = level_names @@ -277,7 +280,7 @@ def _convert_and_validate(self, value: Any): # that it is the same type as other columns in the accessor. 
value = column.as_column(value) - if len(self._data) > 0: + if hasattr(self, '_data') and len(self._data) > 0: first = next(iter(self._data.values())) if len(value) != len(first): raise ValueError("All columns must be of equal length") From c39932c4dc46e35c262440b76d16d0ed3733e8c3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 15:39:01 -0700 Subject: [PATCH 099/138] Final cleanup and documentation. --- python/cudf/cudf/core/column_accessor.py | 52 ++++++++++++++---------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 2b5ed21b010..c175a6d9da7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -33,6 +33,7 @@ class ColumnAccessor(MutableMapping): _data: "dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] + _column_length: int def __init__( self, @@ -62,15 +63,30 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names + self._column_length = column_length + else: + # This code path is performance-critical for copies and should be + # modified with care. + self._data = {} + if data: + data = dict(data) + # Faster than next(iter(data.values())) + column_length = len(data[next(iter(data))]) + for k, v in data.items(): + # Much faster to avoid the function call if possible; the + # extra isinstance is negligible if we do have to make a + # column from something else. + if not isinstance(v, column.ColumnBase): + v = column.as_column(v) + if len(v) != column_length: + raise ValueError("All columns must be of equal length") + self._data[k] = v + self._column_length = column_length + else: + self._column_length = None - # Explicitly initialize an empty data dict so that we can validate each - # new column. 
- self._data = {} - for k, v in dict(data).items(): - self._data[k] = self._convert_and_validate(v) - - self.multiindex = multiindex - self._level_names = level_names + self.multiindex = multiindex + self._level_names = level_names def __iter__(self): return self._data.__iter__() @@ -274,18 +290,6 @@ def select_by_index(self, index: Any) -> ColumnAccessor: data, multiindex=self.multiindex, level_names=self.level_names, ) - def _convert_and_validate(self, value: Any): - # Make sure that the provided value can be stored as a column. This - # method will convert the column to an appropriate type and make sure - # that it is the same type as other columns in the accessor. - - value = column.as_column(value) - if hasattr(self, '_data') and len(self._data) > 0: - first = next(iter(self._data.values())) - if len(value) != len(first): - raise ValueError("All columns must be of equal length") - return value - def set_by_label(self, key: Any, value: Any, validate: bool = True): """ Add (or modify) column by name. @@ -302,7 +306,13 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True): """ key = self._pad_key(key) if validate: - value = self._convert_and_validate(value) + value = column.as_column(value) + if len(self._data) > 0: + if len(value) != self._column_length: + raise ValueError("All columns must be of equal length") + else: + self._column_length = len(value) + self._data[key] = value self._clear_cache() From 4ff09fcf66566ac9aaf3b6df75cf2c60e96c060e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Mar 2021 15:57:35 -0700 Subject: [PATCH 100/138] Address style issues. 
--- python/cudf/cudf/core/column_accessor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c175a6d9da7..6988efeafa7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -8,6 +8,7 @@ TYPE_CHECKING, Any, Callable, + Dict, Mapping, Optional, Tuple, @@ -30,7 +31,7 @@ class ColumnAccessor(MutableMapping): - _data: "dict[Any, ColumnBase]" + _data: "Dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] _column_length: int @@ -63,7 +64,7 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - self._column_length = column_length + self._column_length = data._column_length else: # This code path is performance-critical for copies and should be # modified with care. @@ -82,8 +83,6 @@ def __init__( raise ValueError("All columns must be of equal length") self._data[k] = v self._column_length = column_length - else: - self._column_length = None self.multiindex = multiindex self._level_names = level_names From 0178127205b44383bf4d6a2c3d424512aa80b033 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 17:28:57 -0400 Subject: [PATCH 101/138] CA fix --- python/cudf/cudf/core/column_accessor.py | 29 ++++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 84a21d78266..bd3e801fbec 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -19,11 +19,7 @@ import cudf from cudf.core import column -from cudf.utils.utils import ( - cached_property, - to_flat_dict, - to_nested_dict, -) +from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -34,7 +30,6 @@ class ColumnAccessor(MutableMapping): _data: 
"Dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] - _column_length: int def __init__( self, @@ -64,15 +59,13 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - self._column_length = data._column_length else: # This code path is performance-critical for copies and should be # modified with care. self._data = {} if data: data = dict(data) - # Faster than next(iter(data.values())) - column_length = len(data[next(iter(data))]) + column_length = _length_of_first_value(data) for k, v in data.items(): # Much faster to avoid the function call if possible; the # extra isinstance is negligible if we do have to make a @@ -82,8 +75,6 @@ def __init__( if len(v) != column_length: raise ValueError("All columns must be of equal length") self._data[k] = v - self._column_length = column_length - self.multiindex = multiindex self._level_names = level_names @@ -144,6 +135,10 @@ def nrows(self) -> int: else: return len(next(iter(self.values()))) + @cached_property + def _column_length(self) -> int: + return _length_of_first_value(self._data) + @cached_property def names(self) -> Tuple[Any, ...]: return tuple(self.keys()) @@ -164,7 +159,12 @@ def _grouped_data(self) -> MutableMapping: return self._data def _clear_cache(self): - cached_properties = "columns", "names", "_grouped_data" + cached_properties = ( + "columns", + "names", + "_grouped_data", + "_column_length", + ) for attr in cached_properties: try: self.__delattr__(attr) @@ -473,3 +473,8 @@ def _compare_keys(target: Any, key: Any) -> bool: if k1 != k2: return False return True + + +def _length_of_first_value(data: Dict[Any, Any]) -> int: + # faster than next(iter(data.values())): + return 0 if not data else len(data[next(iter(data))]) From c8d23641df2ac735725523bc19bad8dc25d36a6e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 17:48:32 -0400 Subject: [PATCH 102/138] Don't validate on gathers --- python/cudf/cudf/core/join/join.py | 8 
++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 0d45abb76c2..862c50d8ca7 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -299,9 +299,13 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: data = left_result._data.__class__() for lcol in left_names: - data[left_names[lcol]] = left_result._data[lcol] + data.set_by_label( + left_names[lcol], left_result._data[lcol], validate=False + ) for rcol in right_names: - data[right_names[rcol]] = right_result._data[rcol] + data.set_by_label( + right_names[rcol], right_result._data[rcol], validate=False + ) # Index of the result: if self.left_index and self.right_index: From efea63dd02a6d143a72bc8e76d012d24a27a8af6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 17:53:07 -0400 Subject: [PATCH 103/138] Prioritize numeric columns --- python/cudf/cudf/core/column/column.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2b2874eeb4..dd06d97d105 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1017,7 +1017,9 @@ def distinct_count( return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: - if is_categorical_dtype(dtype): + if is_numerical_dtype(dtype): + return self.as_numerical_column(dtype) + elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { np.str_, @@ -1548,6 +1550,16 @@ def build_column( """ dtype = pd.api.types.pandas_dtype(dtype) + if is_numerical_dtype(dtype): + assert data is not None + return cudf.core.column.NumericalColumn( + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + ) 
if is_categorical_dtype(dtype): if not len(children) == 1: raise ValueError( @@ -1634,15 +1646,7 @@ def build_column( children=children, ) else: - assert data is not None - return cudf.core.column.NumericalColumn( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) + raise TypeError(f"Unrecognized dtype: {dtype}") def build_categorical_column( From c3b6444787e29a4536104adad1dc3508b7e5a9dd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 14:56:08 -0700 Subject: [PATCH 104/138] Lazily compute and delete column length on demand. --- python/cudf/cudf/core/column_accessor.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 84a21d78266..f0677618d76 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -64,7 +64,6 @@ def __init__( self._data = data._data self.multiindex = multiindex self._level_names = level_names - self._column_length = data._column_length else: # This code path is performance-critical for copies and should be # modified with care. @@ -82,7 +81,6 @@ def __init__( if len(v) != column_length: raise ValueError("All columns must be of equal length") self._data[k] = v - self._column_length = column_length self.multiindex = multiindex self._level_names = level_names @@ -163,6 +161,13 @@ def _grouped_data(self) -> MutableMapping: else: return self._data + @cached_property + def _column_length(self): + try: + return len(self._data[next(iter(self._data))]) + except StopIteration: + return 0 + def _clear_cache(self): cached_properties = "columns", "names", "_grouped_data" for attr in cached_properties: @@ -171,6 +176,10 @@ def _clear_cache(self): except AttributeError: pass + # Column length should only be cleared if no data is present. 
+ if len(self._data) == 0 and hasattr(self, "_column_length"): + del self._column_length + def to_pandas_index(self) -> pd.Index: """" Convert the keys of the ColumnAccessor to a Pandas Index object. From 01b2cf572d596da54423fdff36beefe1da382bb3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 14:59:42 -0700 Subject: [PATCH 105/138] Remove redundant clear cache in setitem. --- python/cudf/cudf/core/column_accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index f0677618d76..77445dae3c7 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -93,7 +93,6 @@ def __getitem__(self, key: Any) -> ColumnBase: def __setitem__(self, key: Any, value: Any): self.set_by_label(key, value) - self._clear_cache() def __delitem__(self, key: Any): self._data.__delitem__(key) From 88992581ec04b4092e9ef02edbd03350c31af0fa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 15:06:38 -0700 Subject: [PATCH 106/138] Remove mypy annotation for column length. --- python/cudf/cudf/core/column_accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 77445dae3c7..44484927985 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -34,7 +34,6 @@ class ColumnAccessor(MutableMapping): _data: "Dict[Any, ColumnBase]" multiindex: bool _level_names: Tuple[Any, ...] 
- _column_length: int def __init__( self, From c6cd41528d008d5fce038bc3c45484a12fca304f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 18:54:03 -0400 Subject: [PATCH 107/138] Optimize casting logic --- python/cudf/cudf/core/column/categorical.py | 3 ++ python/cudf/cudf/core/join/_join_helpers.py | 50 ++++++++++++--------- python/cudf/cudf/core/join/join.py | 11 +++-- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 39c278d2abf..bb1bf3c5d5c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -750,6 +750,9 @@ def _set_categories( ordered=ordered, ) + def _decategorize(self) -> ColumnBase: + return self._column._get_decategorized_column() + class CategoricalColumn(column.ColumnBase): """Implements operations for Columns of Categorical type diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 209a6c5c4e4..544bc385358 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any, Iterable +from typing import TYPE_CHECKING, Any, Iterable, Tuple import numpy as np import pandas as pd @@ -11,8 +11,7 @@ from cudf.core.dtypes import CategoricalDtype if TYPE_CHECKING: - from cudf._typing import Dtype - from cudf.core.column import ColumnBase + from cudf.core.column import CategoricalColumn, ColumnBase from cudf.core.frame import Frame @@ -73,7 +72,13 @@ def _frame_select_by_indexers( return result -def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: +def _match_join_keys( + lcol: ColumnBase, rcol: ColumnBase, how: str +) -> Tuple[ColumnBase, ColumnBase]: + # returns the common dtype that lcol and rcol should be casted to, + # before they can be used as left and 
right join keys. + # If no casting is necessary, returns None + common_type = None # cast the keys lcol and rcol to a common dtype @@ -84,10 +89,10 @@ def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: if isinstance(ltype, CategoricalDtype) or isinstance( rtype, CategoricalDtype ): - return _match_categorical_dtypes(ltype, rtype, how) + return _match_categorical_dtypes(lcol, rcol, how) if pd.api.types.is_dtype_equal(ltype, rtype): - return ltype + return lcol, rcol if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): common_type = ( @@ -103,45 +108,50 @@ def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase, how: str) -> Dtype: if how == "left": if rcol.fillna(0).can_cast_safely(ltype): - return ltype + return lcol, rcol.astype(ltype) else: warnings.warn( f"Can't safely cast column from {rtype} to {ltype}, " "upcasting to {common_type}." ) - return common_type + return lcol.astype(common_type), rcol.astype(common_type) -def _match_categorical_dtypes(ltype: Dtype, rtype: Dtype, how: str) -> Dtype: +def _match_categorical_dtypes( + lcol: ColumnBase, rcol: ColumnBase, how: str +) -> Tuple[ColumnBase, ColumnBase]: # cast the keys lcol and rcol to a common dtype # when at least one of them is a categorical type + ltype, rtype = lcol.dtype, rcol.dtype - if isinstance(ltype, CategoricalDtype) and isinstance( - rtype, CategoricalDtype + if isinstance(lcol, CategoricalColumn) and isinstance( + rcol, CategoricalColumn ): # if both are categoricals, logic is complicated: - return _match_categorical_dtypes_both(ltype, rtype, how) + return _match_categorical_dtypes_both(lcol, rcol, how) if isinstance(ltype, CategoricalDtype): if how in {"left", "leftsemi", "leftanti"}: - return ltype + return lcol, rcol.astype(ltype) common_type = ltype.categories.dtype elif isinstance(rtype, CategoricalDtype): common_type = rtype.categories.dtype - return common_type + return lcol.astype(common_type), rcol.astype(common_type) def 
_match_categorical_dtypes_both( - ltype: CategoricalDtype, rtype: CategoricalDtype, how: str -) -> Dtype: + lcol: CategoricalColumn, rcol: CategoricalColumn, how: str +) -> Tuple[ColumnBase, ColumnBase]: # The commontype depends on both `how` and the specifics of the # categorical variables to be merged. + ltype, rtype = lcol.dtype, rcol.dtype + # when both are ordered and both have the same categories, # no casting required: if ltype == rtype: - return ltype + return lcol, rcol # Merging categorical variables when only one side is ordered is # ambiguous and not allowed. @@ -167,11 +177,11 @@ def _match_categorical_dtypes_both( if how == "inner": # cast to category types -- we must cast them back later return _match_join_keys( - ltype.categories._values, rtype.categories._values, how + lcol.cat()._decategorize(), rcol.cat()._decategorize(), how, ) elif how in {"left", "leftanti", "leftsemi"}: # always cast to left type - return ltype + return lcol, rcol.astype(ltype) else: # merge categories merged_categories = cudf.concat( @@ -180,7 +190,7 @@ def _match_categorical_dtypes_both( common_type = cudf.CategoricalDtype( categories=merged_categories, ordered=False ) - return common_type + return lcol.astype(common_type), rcol.astype(common_type) def _coerce_to_tuple(obj): diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 862c50d8ca7..6e35f35f1c3 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -416,10 +416,13 @@ def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]: out_rhs = rhs.copy(deep=False) for left_key, right_key in zip(*self._keys): lcol, rcol = left_key.get(lhs), right_key.get(rhs) - dtype = _match_join_keys(lcol, rcol, how=self.how) - if dtype: - left_key.set(out_lhs, lcol.astype(dtype)) - right_key.set(out_rhs, rcol.astype(dtype)) + lcol_casted, rcol_casted = _match_join_keys( + lcol, rcol, how=self.how + ) + if lcol is not lcol_casted: + 
left_key.set(out_lhs, lcol_casted) + if rcol is not rcol_casted: + right_key.set(out_rhs, rcol_casted) return out_lhs, out_rhs def _restore_categorical_keys( From 7f8e1cd60525f3a06e064f8fa4bc4d93bb383700 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:01:50 -0400 Subject: [PATCH 108/138] Undo --- python/cudf/cudf/core/column_accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 2582f7be287..a527713099f 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -167,7 +167,6 @@ def _clear_cache(self): "columns", "names", "_grouped_data", - "_column_length", ) for attr in cached_properties: try: From f2e4609f63389ba44b65284feaba155d4ba9721a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:04:48 -0400 Subject: [PATCH 109/138] Don't validate when copying type metadata --- python/cudf/cudf/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e6898b8c606..ecff3dee573 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2408,7 +2408,9 @@ def _copy_type_metadata( for name, col, other_col in zip( self._data.keys(), self._data.values(), other._data.values() ): - self._data[name] = other_col._copy_type_metadata(col) + self._data.set_by_label( + name, other_col._copy_type_metadata(col), validate=False + ) if include_index: if self._index is not None and other._index is not None: From 83cc407ced4eb88cc399c6cae5cd7ee3bae55c5d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:11:48 -0400 Subject: [PATCH 110/138] ImportError --- python/cudf/cudf/core/join/_join_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 
544bc385358..1fb380f8697 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -125,8 +125,8 @@ def _match_categorical_dtypes( # when at least one of them is a categorical type ltype, rtype = lcol.dtype, rcol.dtype - if isinstance(lcol, CategoricalColumn) and isinstance( - rcol, CategoricalColumn + if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance( + rcol, cudf.core.column.CategoricalColumn ): # if both are categoricals, logic is complicated: return _match_categorical_dtypes_both(lcol, rcol, how) From 72598fbcacb046d7485b74f1e801772f4006a526 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:33:43 -0400 Subject: [PATCH 111/138] Prioritize numeric dtypes in is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1438421bb12..375eccce310 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,16 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if is_categorical_dtype(obj): - return False - if is_list_dtype(obj): + if np.issubdtype(obj, np.bool_): + return True + elif np.issubdtype(obj, np.floating): + return True + elif np.issubdtype(obj, np.signedinteger): + return True + elif np.issubdtype(obj, np.unsignedinteger): + return True + else: return False - return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - or np.issubdtype(obj, np.unsignedinteger) - ) def is_string_dtype(obj): From fa220b6d86801ed98c415d48305eb74c0afc9d2e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:47:20 -0400 Subject: [PATCH 112/138] Add unsafe CA ctor --- python/cudf/cudf/_lib/table.pyx | 4 +++- python/cudf/cudf/core/column_accessor.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 
1 deletion(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index f97b45d8abf..8b83de1e31c 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -114,7 +114,9 @@ cdef class Table: for _ in column_names: data_columns.append(Column.from_unique_ptr(move(dereference(it)))) it += 1 - data = dict(zip(column_names, data_columns)) + data = ColumnAccessor._init_unsafe( + dict(zip(column_names, data_columns)) + ) return Table(data=data, index=index) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index a527713099f..50c7dbd8812 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -80,6 +80,19 @@ def __init__( self.multiindex = multiindex self._level_names = level_names + @classmethod + def _init_unsafe( + cls, + data: Dict[Any, ColumnBase], + multiindex: bool = False, + level_names=None, + ) -> ColumnAccessor: + obj = cls() + obj._data = data + obj.multiindex = multiindex + obj._level_names = level_names + return obj + def __iter__(self): return self._data.__iter__() From f7dc417dec0519aca0d866b7f0dedcb9ad3a2d05 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:48:15 -0400 Subject: [PATCH 113/138] Revert "Prioritize numeric dtypes in is_numerical_dtype" This reverts commit 72598fbcacb046d7485b74f1e801772f4006a526. 
--- python/cudf/cudf/utils/dtypes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 375eccce310..1438421bb12 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,16 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if np.issubdtype(obj, np.bool_): - return True - elif np.issubdtype(obj, np.floating): - return True - elif np.issubdtype(obj, np.signedinteger): - return True - elif np.issubdtype(obj, np.unsignedinteger): - return True - else: + if is_categorical_dtype(obj): + return False + if is_list_dtype(obj): return False + return ( + np.issubdtype(obj, np.bool_) + or np.issubdtype(obj, np.floating) + or np.issubdtype(obj, np.signedinteger) + or np.issubdtype(obj, np.unsignedinteger) + ) def is_string_dtype(obj): From 3760077f4f004129ca099b9d0ce8861bf3d87520 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 22 Mar 2021 19:51:47 -0400 Subject: [PATCH 114/138] Revert "Prioritize numeric dtypes in is_numerical_dtype" This reverts commit 72598fbcacb046d7485b74f1e801772f4006a526. 
--- python/cudf/cudf/utils/dtypes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 375eccce310..1438421bb12 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,16 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if np.issubdtype(obj, np.bool_): - return True - elif np.issubdtype(obj, np.floating): - return True - elif np.issubdtype(obj, np.signedinteger): - return True - elif np.issubdtype(obj, np.unsignedinteger): - return True - else: + if is_categorical_dtype(obj): + return False + if is_list_dtype(obj): return False + return ( + np.issubdtype(obj, np.bool_) + or np.issubdtype(obj, np.floating) + or np.issubdtype(obj, np.signedinteger) + or np.issubdtype(obj, np.unsignedinteger) + ) def is_string_dtype(obj): From de9ca28f86b46a6fe9cd93be58e865cd6a8afd96 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 22 Mar 2021 19:49:47 -0700 Subject: [PATCH 115/138] Change error message back so that tests pass. 
--- python/cudf/cudf/core/column_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 44484927985..d2bab50a8ba 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -78,7 +78,7 @@ def __init__( if not isinstance(v, column.ColumnBase): v = column.as_column(v) if len(v) != column_length: - raise ValueError("All columns must be of equal length") + raise ValueError("All values must be of equal length") self._data[k] = v self.multiindex = multiindex From e35d03b339dc008e8f264dec9d86a8417f3a77db Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 10:47:05 -0400 Subject: [PATCH 116/138] Faster is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1438421bb12..8aa0e05bb07 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -148,11 +148,18 @@ def is_numerical_dtype(obj): return False if is_list_dtype(obj): return False + # convert to an np.dtype object first, + # otherwise each of the np.issubdtype() calls + # below will be slow. 
+ try: + dtype = np.dtype(obj) + except TypeError: + return False return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - or np.issubdtype(obj, np.unsignedinteger) + np.issubdtype(dtype, np.bool_) + or np.issubdtype(dtype, np.floating) + or np.issubdtype(dtype, np.signedinteger) + or np.issubdtype(dtype, np.unsignedinteger) ) From e2fd53369a554cfa887a137261baafdd94854bcd Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 10:51:45 -0400 Subject: [PATCH 117/138] Faster is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8aa0e05bb07..225450d84b3 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -148,19 +148,11 @@ def is_numerical_dtype(obj): return False if is_list_dtype(obj): return False - # convert to an np.dtype object first, - # otherwise each of the np.issubdtype() calls - # below will be slow. 
try: dtype = np.dtype(obj) except TypeError: return False - return ( - np.issubdtype(dtype, np.bool_) - or np.issubdtype(dtype, np.floating) - or np.issubdtype(dtype, np.signedinteger) - or np.issubdtype(dtype, np.unsignedinteger) - ) + return dtype.kind in "biuf" def is_string_dtype(obj): From 64ca702d44d2c75463c19bfd2e7a762e1b7d7717 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 11:12:39 -0400 Subject: [PATCH 118/138] Even faster is_numerical_dtype --- python/cudf/cudf/utils/dtypes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 225450d84b3..4080d9cff9c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,10 +144,6 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if is_categorical_dtype(obj): - return False - if is_list_dtype(obj): - return False try: dtype = np.dtype(obj) except TypeError: From 749edf18897ae1667eb6fa34972f76c25e2bec5f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 11:23:03 -0400 Subject: [PATCH 119/138] Enable fast path for constructing a Buffer from a DeviceBuffer --- python/cudf/cudf/core/buffer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 350346a87f9..9fc5570e35a 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -42,6 +42,10 @@ def __init__( self.ptr = data.ptr self.size = data.size self._owner = owner or data._owner + elif isinstance(data, rmm.DeviceBuffer): + self.ptr = data.ptr + self.size = data.size + self._owner = data elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): From ca772b8ca46c3ae11b7232c33599387f0b42af65 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 12:02:31 -0400 Subject: [PATCH 120/138] Small fix --- python/cudf/cudf/core/join/join.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 0d45abb76c2..d066d40d052 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -452,6 +452,6 @@ def __init__(self, *args, **kwargs): def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: # semi-join result includes only lhs columns if issubclass(self._out_class, cudf.Index): - return self._out_class._from_data(lhs) + return self._out_class._from_data(lhs._data) else: return self._out_class._from_data(lhs._data, index=lhs._index) From 739ec57975ae1fa2817633da577989162c01ef93 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Mar 2021 09:49:20 -0700 Subject: [PATCH 121/138] Add validation option to insert and standardize error message. --- python/cudf/cudf/core/column_accessor.py | 11 +++++++++-- python/cudf/cudf/tests/test_dataframe.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index d2bab50a8ba..add0570fc8f 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -78,7 +78,7 @@ def __init__( if not isinstance(v, column.ColumnBase): v = column.as_column(v) if len(v) != column_length: - raise ValueError("All values must be of equal length") + raise ValueError("All columns must be of equal length") self._data[k] = v self.multiindex = multiindex @@ -195,7 +195,7 @@ def to_pandas_index(self) -> pd.Index: result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name: Any, value: Any, loc: int = -1): + def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True): """ Insert column into the ColumnAccessor at the specified location. 
@@ -225,6 +225,13 @@ def insert(self, name: Any, value: Any, loc: int = -1): if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") if loc == len(self._data): + if validate: + value = column.as_column(value) + if len(self._data) > 0: + if len(value) != self._column_length: + raise ValueError("All columns must be of equal length") + else: + self._column_length = len(value) self._data[name] = value else: new_keys = self.names[:loc] + (name,) + self.names[loc:] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b3ba439cb15..76a02d5e74a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5222,7 +5222,7 @@ def test_memory_usage_multi(): def test_setitem_diff_size_list(list_input, key): gdf = cudf.datasets.randomdata(5) with pytest.raises( - ValueError, match=("All values must be of equal length") + ValueError, match=("All columns must be of equal length") ): gdf[key] = list_input From 498b70ed8b337fd412759504770d85acfe57094b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 23 Mar 2021 10:22:15 -0700 Subject: [PATCH 122/138] Fix style. --- python/cudf/cudf/core/column_accessor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index add0570fc8f..0c580132290 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -195,7 +195,9 @@ def to_pandas_index(self) -> pd.Index: result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True): + def insert( + self, name: Any, value: Any, loc: int = -1, validate: bool = True + ): """ Insert column into the ColumnAccessor at the specified location. 
From 01e13fa62bba2ad0b4b34e5574c2152291d65ee2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 17:08:04 -0400 Subject: [PATCH 123/138] Undo formatting change --- python/cudf/cudf/core/column_accessor.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index e2233423db4..68ce4c4c070 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -176,11 +176,7 @@ def _column_length(self): return 0 def _clear_cache(self): - cached_properties = ( - "columns", - "names", - "_grouped_data", - ) + cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: try: self.__delattr__(attr) From 89a03013ef99452e7b12bb6380d98f3cde0635ba Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 17:10:16 -0400 Subject: [PATCH 124/138] Add TODO --- python/cudf/cudf/utils/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 4080d9cff9c..8875a36dba8 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,6 +144,8 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): + # TODO: we should handle objects with a `.dtype` attribute, + # e.g., arrays, here. 
try: dtype = np.dtype(obj) except TypeError: From 5e73de76451740ce5b52694c1c501f92e7429d25 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 23 Mar 2021 20:21:36 -0400 Subject: [PATCH 125/138] init->create + doc --- python/cudf/cudf/_lib/table.pyx | 2 +- python/cudf/cudf/core/column_accessor.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 8b83de1e31c..0d6e9c16e8c 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -114,7 +114,7 @@ cdef class Table: for _ in column_names: data_columns.append(Column.from_unique_ptr(move(dereference(it)))) it += 1 - data = ColumnAccessor._init_unsafe( + data = ColumnAccessor._create_unsafe( dict(zip(column_names, data_columns)) ) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 68ce4c4c070..33bae5c1328 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -81,12 +81,14 @@ def __init__( self._level_names = level_names @classmethod - def _init_unsafe( + def _create_unsafe( cls, data: Dict[Any, ColumnBase], multiindex: bool = False, level_names=None, ) -> ColumnAccessor: + # create a ColumnAccessor without verifying column + # type or size obj = cls() obj._data = data obj.multiindex = multiindex From ca116a37c872bc59fd76edc534fa75b7e4b30727 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 25 Mar 2021 09:40:31 -0400 Subject: [PATCH 126/138] Only gather the index if necessary --- python/cudf/cudf/core/join/join.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index d3bd835bb80..0b1c68bd64b 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -179,10 +179,15 @@ def perform_merge(self) -> Frame: left_result = cudf.core.frame.Frame() right_result = 
cudf.core.frame.Frame() + gather_index = self.left_index or self.right_index if left_rows is not None: - left_result = lhs._gather(left_rows, nullify=True) + left_result = lhs._gather( + left_rows, nullify=True, keep_index=gather_index + ) if right_rows is not None: - right_result = rhs._gather(right_rows, nullify=True) + right_result = rhs._gather( + right_rows, nullify=True, keep_index=gather_index + ) result = self._merge_results(left_result, right_result) From ce03918eb2c792ee35a2b49c0e62d10ef4c6c66d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 25 Mar 2021 09:48:23 -0400 Subject: [PATCH 127/138] Don't copy type metadata for the index unless we need to --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 330398e302f..f1181feb692 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -623,7 +623,7 @@ def _gather(self, gather_map, keep_index=True, nullify=False): nullify=nullify, ) ) - result._copy_type_metadata(self) + result._copy_type_metadata(self, include_index=keep_index) if keep_index and self._index is not None: result._index.names = self._index.names return result From b7c6b198316eff1b0b64d3ddb5691723a6bb6f36 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 25 Mar 2021 10:21:33 -0400 Subject: [PATCH 128/138] Use validate=False in a few more places --- python/cudf/cudf/core/join/_join_helpers.py | 12 +++++++----- python/cudf/cudf/core/join/join.py | 13 +++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 1fb380f8697..7d322fdbc91 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -42,13 +42,15 @@ def get(self, obj: Frame) -> ColumnBase: return obj._index._data[self.name] raise KeyError() - def set(self, obj: Frame, 
value: ColumnBase): + def set(self, obj: Frame, value: ColumnBase, validate=False): # set the colum in `obj` if self.column: - obj._data[self.name] = value + obj._data.set_by_label(self.name, value, validate=validate) else: if obj._index is not None: - obj._index._data[self.name] = value + obj._index._data.set_by_label( + self.name, value, validate=validate + ) else: raise KeyError() @@ -63,9 +65,9 @@ def _frame_select_by_indexers( for idx in indexers: if idx.index: - index_data[idx.name] = idx.get(frame) + index_data.set_by_label(idx.name, idx.get(frame), validate=False) else: - data[idx.name] = idx.get(frame) + data.set_by_label(idx.name, idx.get(frame), validate=False) result_index = cudf.Index._from_data(index_data) if index_data else None result = cudf.core.frame.Frame(data=data, index=result_index) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 0b1c68bd64b..e1e1028f803 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -269,6 +269,7 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: lkey.set( left_result, lkey.get(left_result).fillna(rkey.get(right_result)), + validate=False, ) # Compute the result column names: @@ -425,9 +426,9 @@ def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]: lcol, rcol, how=self.how ) if lcol is not lcol_casted: - left_key.set(out_lhs, lcol_casted) + left_key.set(out_lhs, lcol_casted, validate=False) if rcol is not rcol_casted: - right_key.set(out_rhs, rcol_casted) + right_key.set(out_rhs, rcol_casted, validate=False) return out_lhs, out_rhs def _restore_categorical_keys( @@ -446,10 +447,14 @@ def _restore_categorical_keys( right_key.get(self.rhs).dtype, cudf.CategoricalDtype ): left_key.set( - out_lhs, left_key.get(out_lhs).astype("category") + out_lhs, + left_key.get(out_lhs).astype("category"), + validate=False, ) right_key.set( - out_rhs, right_key.get(out_rhs).astype("category") + out_rhs, + 
right_key.get(out_rhs).astype("category"), + validate=False, ) return out_lhs, out_rhs From 671a0e096a4a44b6472db9e5c9a31cb986260452 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 13:06:43 -0400 Subject: [PATCH 129/138] Import --- python/cudf/cudf/core/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f1181feb692..32c958f506f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3355,8 +3355,6 @@ def _merge( indicator=False, suffixes=("_x", "_y"), ): - from cudf.core.join.join import merge - lhs, rhs = self, right if how == "right": # Merge doesn't support right, so just swap @@ -3366,7 +3364,7 @@ def _merge( left_index, right_index = right_index, left_index suffixes = (suffixes[1], suffixes[0]) - return merge( + return cudf.core.join.merge( lhs, rhs, on=on, From 797087b57021f7e2e175cf6dff89aca49fb082a6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 14:07:06 -0400 Subject: [PATCH 130/138] Review --- python/cudf/cudf/core/frame.py | 3 ++- python/cudf/cudf/core/join/_join_helpers.py | 4 --- python/cudf/cudf/core/join/join.py | 30 ++++++++++++--------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 32c958f506f..fb746d6c794 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -20,6 +20,7 @@ from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries from cudf.core.column import as_column, build_categorical_column, column_empty +from cudf.core.join import merge from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, @@ -3364,7 +3365,7 @@ def _merge( left_index, right_index = right_index, left_index suffixes = (suffixes[1], suffixes[0]) - return cudf.core.join.merge( + return merge( lhs, rhs, on=on, diff --git 
a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 7d322fdbc91..0a20067b41d 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -200,7 +200,3 @@ def _coerce_to_tuple(obj): return tuple(obj) else: return (obj,) - - -def _coerce_to_list(obj): - return list(_coerce_to_tuple(obj)) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index e1e1028f803..2bb959bc0ad 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,13 +2,12 @@ from __future__ import annotations import functools -from collections import OrderedDict, namedtuple +from collections import namedtuple from typing import TYPE_CHECKING, Callable, Tuple import cudf from cudf import _lib as libcudf from cudf.core.join._join_helpers import ( - _coerce_to_list, _coerce_to_tuple, _frame_select_by_indexers, _Indexer, @@ -275,8 +274,8 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: # Compute the result column names: # left_names and right_names will be a mappings of input column names # to the corresponding names in the final result. 
- left_names = OrderedDict(zip(left_result._data, left_result._data)) - right_names = OrderedDict(zip(right_result._data, right_result._data)) + left_names = dict(zip(left_result._data, left_result._data)) + right_names = dict(zip(right_result._data, right_result._data)) # For any columns from left_result and right_result that have the same # name: @@ -288,12 +287,14 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: if self.on: key_columns_with_same_name = self.on else: - key_columns_with_same_name = [] - for lkey, rkey in zip(*self._keys): - if (lkey.index, rkey.index) == (False, False): - if lkey.name == rkey.name: - key_columns_with_same_name.append(lkey.name) - + key_columns_with_same_name = [ + lkey.name + for lkey, rkey in zip(*self._keys) + if ( + (lkey.index, rkey.index) == (False, False) + and lkey.name == rkey.name + ) + ] for name in common_names: if name not in key_columns_with_same_name: left_names[name] = f"{name}{self.lsuffix}" @@ -339,7 +340,10 @@ def _sort_result(self, result: Frame) -> Frame: if isinstance(result, cudf.Index): sort_order = result._get_sorted_inds() else: - sort_order = result._get_sorted_inds(_coerce_to_list(self.on)) + # need a list instead of a tuple here because + # _get_sorted_inds calls down to ColumnAccessor.get_by_label + # which handles lists and tuples differently + sort_order = result._get_sorted_inds(list(self.on)) return result._gather(sort_order, keep_index=False) by = [] if self.left_index and self.right_index: @@ -347,11 +351,11 @@ def _sort_result(self, result: Frame) -> Frame: by.extend(result._index._data.columns) if self.left_on: by.extend( - [result._data[col] for col in _coerce_to_list(self.left_on)] + [result._data[col] for col in _coerce_to_tuple(self.left_on)] ) if self.right_on: by.extend( - [result._data[col] for col in _coerce_to_list(self.right_on)] + [result._data[col] for col in _coerce_to_tuple(self.right_on)] ) if by: to_sort = cudf.DataFrame._from_columns(by) From 
5ad531fa0858e688bfea0434e54117f040dd6dd7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 14:10:38 -0400 Subject: [PATCH 131/138] Coerce to tuple first --- python/cudf/cudf/core/join/join.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 2bb959bc0ad..1a4826d0570 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -343,7 +343,9 @@ def _sort_result(self, result: Frame) -> Frame: # need a list instead of a tuple here because # _get_sorted_inds calls down to ColumnAccessor.get_by_label # which handles lists and tuples differently - sort_order = result._get_sorted_inds(list(self.on)) + sort_order = result._get_sorted_inds( + list(_coerce_to_tuple(self.on)) + ) return result._gather(sort_order, keep_index=False) by = [] if self.left_index and self.right_index: From f7e94fb0264eb590afddf1c256f5945d387c4b28 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 14:13:05 -0400 Subject: [PATCH 132/138] Replace hasattr with isinstance --- python/cudf/cudf/core/join/_join_helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 0a20067b41d..3807f408369 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -1,6 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
from __future__ import annotations +import collections import warnings from typing import TYPE_CHECKING, Any, Iterable, Tuple @@ -196,7 +197,7 @@ def _match_categorical_dtypes_both( def _coerce_to_tuple(obj): - if hasattr(obj, "__iter__") and not isinstance(obj, str): + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): return tuple(obj) else: return (obj,) From 1cb944804d5dd4d54066d2547a4f69da151228bb Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 15:07:13 -0400 Subject: [PATCH 133/138] Handle renamed indexes --- python/cudf/cudf/core/multiindex.py | 12 ++++++++++++ python/cudf/cudf/tests/test_joining.py | 13 +++++++++++++ 2 files changed, 25 insertions(+) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a82735be901..e6cee7c1038 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -190,6 +190,18 @@ def names(self): def names(self, value): value = [None] * self.nlevels if value is None else value assert len(value) == self.nlevels + + if len(value) == len(set(value)): + # IMPORTANT: if the provided names are unique, + # we reconstruct self._data with the names as keys. + # If they are not unique, the keys of self._data + # and self._names will be different, which can lead + # to unexpected behaviour in some cases. This is + # definitely buggy, but we can't disallow non-unique + # names either... 
+ self._data = self._data._create_unsafe( + dict(zip(value, self._data.values())) + ) self._names = pd.core.indexes.frozen.FrozenList(value) def rename(self, names, inplace=False): diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 50141428b02..9164bfe98d1 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1725,3 +1725,16 @@ def test_merge_with_lists(how): got = gd_left.merge(gd_right, on="a") assert_join_results_equal(expect, got, how=how) + + +def test_join_renamed_index(): + df = cudf.DataFrame( + {0: [1, 2, 3, 4, 5], 1: [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 5]} + ).set_index([0, 1]) + df.index.names = ["a", "b"] # doesn't actually change df._index._data + + expect = df.to_pandas().merge( + df.to_pandas(), left_index=True, right_index=True + ) + got = df.merge(df, left_index=True, right_index=True, how="inner") + assert_join_results_equal(expect, got, how="inner") From cc89360b7b44356023287e16b81c1b8201e1a5b8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 15:10:04 -0400 Subject: [PATCH 134/138] Fix to names setter --- python/cudf/cudf/core/multiindex.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e6cee7c1038..1c1e48e7372 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -199,8 +199,9 @@ def names(self, value): # to unexpected behaviour in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... 
- self._data = self._data._create_unsafe( - dict(zip(value, self._data.values())) + self._data = self._data.__class__._create_unsafe( + dict(zip(value, self._data.values())), + level_names=self._data.level_names, ) self._names = pd.core.indexes.frozen.FrozenList(value) From 9cebf2ee4f6b8faa8e97633cfb02c15d929df9c3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 26 Mar 2021 15:56:52 -0400 Subject: [PATCH 135/138] Update cpp/src/join/hash_join.cu Co-authored-by: Mark Harris --- cpp/src/join/hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 90ecae4e3f4..3f11d6d3306 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -114,7 +114,7 @@ get_left_join_indices_complement( // Assume all the indices in invalid_index_map are invalid auto invalid_index_map = std::make_unique>(right_table_row_count, stream); - thrust::uninitialized_fill(thrust::cuda::par.on(stream.value()), + thrust::uninitialized_fill(rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); From 1584b861a90d59ea408f5696874ca68fd08f1147 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 16:00:11 -0400 Subject: [PATCH 136/138] Better example --- cpp/include/cudf/join.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 7c778a17609..fcc0bcd444e 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -78,10 +78,10 @@ inner_join(cudf::table_view const& left_keys, * * @code{.pseudo} * Left: {{0, 1, 2}} - * Right: {{1, 2, 3}, {1, 2, 5}} + * Right: {{4, 9, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * Result: {{1, 2}, {1, 2}, {1, 2}} + * Result: {{1, 2}, {4, 9}, {1, 2}} * @endcode * * @throw cudf::logic_error if number of elements in `left_on` or `right_on` From 3977b793451d7687e17f4712a67227f850d87ef1 Mon 
Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 16:09:33 -0400 Subject: [PATCH 137/138] Remove std::moves --- cpp/src/join/semi_join.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index fea9ea45fd3..80a1ef9e204 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -55,7 +55,7 @@ std::unique_ptr> left_semi_anti_join( auto result = std::make_unique>(left_keys.num_rows(), stream, mr); thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end()); - return std::move(result); + return result; } auto const left_num_rows = left_keys.num_rows(); @@ -114,7 +114,7 @@ std::unique_ptr> left_semi_anti_join( auto join_size = thrust::distance(gather_map->begin(), gather_map_end); gather_map->resize(join_size, stream); - return std::move(gather_map); + return gather_map; } /** From 7bf65611212a4f3c2e7454168d05d6b23d93c85b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 26 Mar 2021 16:29:57 -0400 Subject: [PATCH 138/138] Fix formatting error --- cpp/src/join/hash_join.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 3f11d6d3306..5a6ad8892de 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -114,10 +114,8 @@ get_left_join_indices_complement( // Assume all the indices in invalid_index_map are invalid auto invalid_index_map = std::make_unique>(right_table_row_count, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), - invalid_index_map->begin(), - invalid_index_map->end(), - int32_t{1}); + thrust::uninitialized_fill( + rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); // Functor to check for index validity since left joins can create invalid indices valid_range valid(0, right_table_row_count);