diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu index bd013afc451..fa6afdd908c 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ b/cpp/benchmarks/join/join_benchmark.cu @@ -105,12 +105,8 @@ static void BM_join(benchmark::State &state) for (auto _ : state) { cuda_event_timer raii(state, true, 0); - auto result = cudf::inner_join(probe_table, - build_table, - columns_to_join, - columns_to_join, - {{0, 0}}, - cudf::null_equality::UNEQUAL); + auto result = cudf::inner_join( + probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); } } diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b2c1296ccef..fcc0bcd444e 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -30,6 +31,44 @@ namespace cudf { * @file */ +/** + * @brief Returns a pair of row index vectors corresponding to an + * inner join between the specified tables. + * + * The first returned vector contains the row indices from the left + * table that have a match in the right table (in unspecified order). + * The corresponding values in the second returned vector are + * the matched row indices from the right table. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Result: {{1, 2}, {0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Result: {{1}, {0}} + * + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param[in] left_keys The left table + * @param[in] right_keys The right table + * @param[in] compare_nulls controls whether null join-key values + * should match or not. 
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing an inner join between two tables with `left_keys` and `right_keys` + * as the join keys . + */ +std::pair>, + std::unique_ptr>> +inner_join(cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs an inner join on the specified columns of two * tables (`left`, `right`) @@ -38,26 +77,13 @@ namespace cudf { * in the columns being joined on match. * * @code{.pseudo} - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, a: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right: {{4, 9, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * columns_in_common: { {0, 1} } - * Result: { a: {1, 2}, b: {1, 2} } - * - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} - * left_on: {0} - * right_on: {0} - * columns_in_common: { } - * Result: { a: {1, 2}, b: {1, 2}, c: {1, 2} } + * Result: {{1, 2}, {4, 9}, {1, 2}} * @endcode * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`. - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) such that the location of `L` within `left_on` is not equal to - * location of R within `right_on` * @throw cudf::logic_error if number of elements in `left_on` or `right_on` * mismatch. * @throw cudf::logic_error if number of columns in either `left` or `right` @@ -73,59 +99,83 @@ namespace cudf { * @param[in] right_on The column indices from `right` to join on. * The column from `right` indicated by `right_on[i]` will be compared against the column * from `left` indicated by `left_on[i]`. 
- * @param[in] columns_in_common is a vector of pairs of column indices into - * `left` and `right`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `left_on` columns. Else, for every column in `left_on` and `right_on`, - * an output column will be produced. For each of these pairs (L, R), L - * should exist in `left_on` and R should exist in `right_on`. * @param[in] compare_nulls controls whether null join-key values * should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table will be joined columns of - * `left(including common columns)+right(excluding common columns)`. + * specified by `left_on` and `right_on`. */ std::unique_ptr inner_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a pair of row index vectors corresponding to a + * left join between the specified tables. + * + * The first returned vector contains all the row indices from the left + * table (in unspecified order). The corresponding value in the + * second returned vector is either (1) the row index of the matched row + * from the right table, if there is a match or (2) an unspecified + * out-of-bounds value. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Result: {{0, 1, 2}, {None, 0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Result: {{0, 1, 2}, {None, 0, None}} + * + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. 
+ * + * @param[in] left_keys The left table + * @param[in] right_keys The right table + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a left join between two tables with `left_keys` and `right_keys` + * as the join keys . + */ +std::pair>, + std::unique_ptr>> +left_join(cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a left join (also known as left outer join) on the * specified columns of two tables (`left`, `right`) * - * Left Join returns all the rows from the left table and those rows from the + * Left join returns all the rows from the left table and those rows from the * right table that match on the joined columns. * For rows from the right table that do not have a match, the corresponding * values in the left columns will be null. * * @code{.pseudo} - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, a: {1 ,2 ,5} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}, {1, 2 ,5}} * left_on: {0} * right_on: {1} - * columns_in_common: { {0, 1} } - * Result: { a: {0, 1, 2}, b: {NULL, 1, 2} } + * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} } * - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {0} - * columns_in_common: { } - * Result: { a: {0, 1, 2}, b: {NULL, 1, 2}, c: {NULL, 1, 2} } + * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} } * @endcode * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`. 
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) such that the location of `L` within `left_on` is not equal to - * location of R within `right_on` * @throw cudf::logic_error if number of elements in `left_on` or `right_on` * mismatch. * @throw cudf::logic_error if number of columns in either `left` or `right` @@ -141,29 +191,59 @@ std::unique_ptr inner_join( * @param[in] right_on The column indices from `right` to join on. * The column from `right` indicated by `right_on[i]` will be compared against the column * from `left` indicated by `left_on[i]`. - * @param[in] columns_in_common is a vector of pairs of column indices into - * `left` and `right`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `left_on` columns. Else, for every column in `left_on` and `right_on`, - * an output column will be produced. For each of these pairs (L, R), L - * should exist in `left_on` and R should exist in `right_on`. * @param[in] compare_nulls controls whether null join-key values * should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table will be joined columns of - * `left(including common columns)+right(excluding common columns)`. + * specified by `left_on` and `right_on`. */ std::unique_ptr left_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a pair of row index vectors corresponding to a + * full join between the specified tables. 
+ * + * Taken pairwise, the values from the returned vectors are one of: + * (1) row indices corresponding to matching rows from the left and + * right tables, (2) a row index and an unspecified out-of-bounds value, + * representing a row from one table without a match in the other. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Result: {{0, 1, 2, None}, {None, 0, 1, 2}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} + * + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param[in] left_keys The left table + * @param[in] right_keys The right table + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a full join between two tables with `left_keys` and `right_keys` + * as the join keys. + */ +std::pair>, + std::unique_ptr>> +full_join(cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a full join (also known as full outer join) on the * specified columns of two tables (`left`, `right`) @@ -174,26 +254,19 @@ std::unique_ptr left_join( * values in the left columns will be null. 
* * @code{.pseudo} - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * columns_in_common: { {0, 1} } - * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} } + * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} } * - * Left a: {0, 1, 2} - * Right b: {1, 2, 3}, c: {1, 2, 5} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {0} - * columns_in_common: { } - * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} } + * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} } * @endcode * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`. - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (L, R) such that the location of `L` within `left_on` is not equal to - * location of R within `right_on` * @throw cudf::logic_error if number of elements in `left_on` or `right_on` * mismatch. * @throw cudf::logic_error if number of columns in either `left` or `right` @@ -209,28 +282,54 @@ std::unique_ptr left_join( * @param[in] right_on The column indices from `right` to join on. * The column from `right` indicated by `right_on[i]` will be compared against the column * from `left` indicated by `left_on[i]`. - * @param[in] columns_in_common is a vector of pairs of column indices into - * `left` and `right`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `left_on` columns. Else, for every column in `left_on` and `right_on`, - * an output column will be produced. For each of these pairs (L, R), L - * should exist in `left_on` and R should exist in `right_on`. * @param[in] compare_nulls controls whether null join-key values * should match or not. 
* @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table will be joined columns of - * `left(including common columns)+right(excluding common columns)`. + * specified by `left_on` and `right_on`. */ std::unique_ptr full_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a vector of row indices corresponding to a left semi join + * between the specified tables. + * + * The returned vector contains the row indices from the left table + * for which there is a matching row in the right table. + * + * @code{.pseudo} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}} + * right_on: {1} + * Result: {1, 2} + * @endcode + * + * @throw cudf::logic_error if number of columns in either + * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE + * + * @param[in] left_keys The left table + * @param[in] right_keys The right table + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A vector `left_indices` that can be used to construct + * the result of performing a left semi join between two tables with + * `left_keys` and `right_keys` as the join keys . 
+ */ +std::unique_ptr> left_semi_join( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a left semi join on the specified columns of two * tables (`left`, `right`) @@ -239,24 +338,20 @@ std::unique_ptr full_join( * returns rows that exist in the right table. * * @code{.pseudo} - * TableA a: {0, 1, 2} - * TableB b: {1, 2, 3}, a: {1, 2, 5} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * return_columns: { 0 } - * Result: { a: {1, 2} } + * Result: { {1, 2} } * - * TableA a: {0, 1, 2}, c: {1, 2, 5} - * TableB b: {1, 2, 3} + * TableA: {{0, 1, 2}, {1, 2, 5}} + * TableB: {{1, 2, 3}} * left_on: {0} * right_on: {0} - * return_columns: { 1 } - * Result: { c: {1, 2} } + * Result: { {1, 2}, {2, 5} } * @endcode * - * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0 - * @throw cudf::logic_error if the number of returned columns is 0 - * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal + * @throw cudf::logic_error if the number of columns in either `left` or `right` is 0 * * @param[in] left The left table * @param[in] right The right table @@ -268,22 +363,49 @@ std::unique_ptr full_join( * The column from `right` indicated by `right_on[i]` * will be compared against the column from `left` * indicated by `left_on[i]`. - * @param[in] return_columns A vector of column indices from `left` to - * include in the returned table. * @param[in] compare_nulls Controls whether null join-key values should match or not. * @param[in] mr Device memory resource used to allocate the returned table's * device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. 
The resulting table - * will contain `return_columns` from `left` that match in right. + * specified by `left_on` and `right_on`. */ std::unique_ptr left_semi_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a vector of row indices corresponding to a left anti join + * between the specified tables. + * + * The returned vector contains the row indices from the left table + * for which there is no matching row in the right table. + * + * @code{.pseudo} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}} + * Result: {0} + * @endcode + * + * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0 + * + * @param[in] left_keys The left table + * @param[in] right_keys The right table + * @param[in] compare_nulls controls whether null join-key values + * should match or not. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A column `left_indices` that can be used to construct + * the result of performing a left anti join between two tables with + * `left_keys` and `right_keys` as the join keys . + */ +std::unique_ptr> left_anti_join( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -295,24 +417,23 @@ std::unique_ptr left_semi_join( * returns rows that do not exist in the right table. 
* * @code{.pseudo} - * TableA a: {0, 1, 2} - * TableB b: {1, 2, 3}, a: {1, 2, 5} + * TableA: {{0, 1, 2}} + * TableB: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * return_columns: { 0 } - * Result: { a: {0} } + * Result: {{0}} * - * TableA a: {0, 1, 2}, c: {1, 2, 5} - * TableB b: {1, 2, 3} + * TableA: {{0, 1, 2}, {1, 2, 5}} + * TableB: {{1, 2, 3}} * left_on: {0} * right_on: {0} - * return_columns: { 1 } - * Result: { c: {1} } + * Result: {{0}, {1}} * @endcode * - * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0 - * @throw cudf::logic_error if the number of returned columns is 0 - * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal + * @throw cudf::logic_error if number of elements in `left_on` or `right_on` + * mismatch. + * @throw cudf::logic_error if number of columns in either `left` or `right` + * table is 0 or exceeds MAX_JOIN_SIZE * * @param[in] left The left table * @param[in] right The right table @@ -324,22 +445,18 @@ std::unique_ptr left_semi_join( * The column from `right` indicated by `right_on[i]` * will be compared against the column from `left` * indicated by `left_on[i]`. - * @param[in] return_columns A vector of column indices from `left` to - * include in the returned table. * @param[in] compare_nulls Controls whether null join-key values should match or not. * @param[in] mr Device memory resource used to allocate the returned table's * device memory * * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table - * will contain `return_columns` from `left` that match in right. + * specified by `left_on` and `right_on`. 
*/ std::unique_ptr left_anti_join( cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -393,128 +510,75 @@ class hash_join { * undefined. * * @param build The build table, from which the hash table is built. - * @param build_on The column indices from `build` to join on. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches */ hash_join(cudf::table_view const& build, - std::vector const& build_on, null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Controls where common columns will be output for a inner join. - */ - enum class common_columns_output_side { - PROBE, ///< Common columns is output in the probe portion of the table pair returned by - ///< `inner_join`. - BUILD ///< Common columns is output in the build portion of the table pair returned by - ///< `inner_join`. - }; - - /** - * @brief Performs an inner join by probing in the internal hash table. - * - * Given that it is sometimes desired to choose the small table to be the `build` side for an - * inner join,a (`probe`, `build`) table pair, which contains the probe and build portions of the - * logical joined table respectively, is returned so that caller can freely rearrange them to - * restore the logical `left` `right` order. This introduces some extra logic about where "common" - * columns should go, i.e. the legacy `cudf::inner_join()` API always outputs "common" columns in - * the `left` portion and the corresponding columns in the `right` portion are omitted. 
To better - * align with the legacy `cudf::inner_join()` API, a `common_columns_output_side` parameter is - * introduced to specify whether "common" columns should go in `probe` or `build` portion. - * - * More details please @see cudf::inner_join(). + * Returns the row indices that can be used to construct the result of performing + * an inner join between two tables. @see cudf::inner_join(). * * @param probe The probe table, from which the tuples are probed. - * @param probe_on The column indices from `probe` to join on. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns or `build_on` columns if `probe_output_side` is LEFT or RIGHT. - * Else, for every column in `probe_on` and `build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `build_on`. - * @param common_columns_output_side @see `common_columns_output_side`. * @param compare_nulls Controls whether null join-key values should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device * memory. * @param stream CUDA stream used for device memory operations and kernel launches * - * @return Table pair of (`probe`, `build`) of joining both tables on the columns - * specified by `probe_on` and `build_on`. The resulting table pair will be joined columns of - * (`probe(including common columns)`, `build(excluding common columns)`) if - * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`, - * `build(including common columns)`) if `common_columns_output_side` is `BUILD`. 
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing an inner join between two tables with `build` and `probe` + * as the join keys. */ - std::pair, std::unique_ptr> inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side = common_columns_output_side::PROBE, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + std::pair>, + std::unique_ptr>> + inner_join(cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** - * @brief Performs a left join by probing in the internal hash table. - * - * More details please @see cudf::left_join(). + * Returns the row indices that can be used to construct the result of performing + * a left join between two tables. @see cudf::left_join(). * * @param probe The probe table, from which the tuples are probed. - * @param probe_on The column indices from `probe` to join on. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `build_on`. * @param compare_nulls Controls whether null join-key values should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device * memory. 
* @param stream CUDA stream used for device memory operations and kernel launches * - * @return Result of joining `build` and `probe` tables on the columns - * specified by `build_on` and `probe_on`. The resulting table will be joined columns of - * `probe(including common columns)+build(excluding common columns)`. + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a left join between two tables with `build` and `probe` + * as the join keys. */ - std::unique_ptr left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + std::pair>, + std::unique_ptr>> + left_join(cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** - * @brief Performs a full join by probing in the internal hash table. - * - * More details please @see cudf::full_join(). + * Returns the row indices that can be used to construct the result of performing + * a full join between two tables. @see cudf::full_join(). * * @param probe The probe table, from which the tuples are probed. - * @param probe_on The column indices from `probe` to join on. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `build_on`. 
* @param compare_nulls Controls whether null join-key values should match or not. * @param mr Device memory resource used to allocate the returned table and columns' device * memory. * @param stream CUDA stream used for device memory operations and kernel launches * - * @return Result of joining `build` and `probe` tables on the columns - * specified by `build_on` and `probe_on`. The resulting table will be joined columns of - * `probe(including common columns)+build(excluding common columns)`. + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a full join between two tables with `build` and `probe` + * as the join keys. */ - std::unique_ptr full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; + std::pair>, + std::unique_ptr>> + full_join(cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: struct hash_join_impl; diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 5cdecab9115..a225e590f9a 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -126,6 +126,11 @@ class table_view_base { */ size_type num_rows() const noexcept { return _num_rows; } + /** + * @brief Returns true if `num_columns()` returns zero, or false otherwise + */ + bool is_empty() const noexcept { return num_columns() == 0; } + table_view_base() = default; ~table_view_base() = default; diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index dc153e9395d..181752d18e8 100644 
--- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -43,9 +43,7 @@ std::unique_ptr gather(table_view const& source_table, if (neg_indices == negative_index_policy::ALLOWED) { cudf::size_type n_rows = source_table.num_rows(); - auto idx_converter = [n_rows] __device__(size_type in) { - return ((in % n_rows) + n_rows) % n_rows; - }; + auto idx_converter = [n_rows] __device__(size_type in) { return in < 0 ? in + n_rows : in; }; return gather(source_table, thrust::make_transform_iterator(map_begin, idx_converter), thrust::make_transform_iterator(map_end, idx_converter), diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index d827d03a6c0..5a6ad8892de 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include @@ -20,93 +21,44 @@ #include #include -#include +#include #include +#include #include namespace cudf { namespace detail { -/** - * @brief Returns a vector with non-common indices which is set difference - * between `[0, num_columns)` and index values in common_column_indices - * - * @param num_columns The number of columns, which represents column indices - * from `[0, num_columns)` in a table - * @param common_column_indices A vector of common indices which needs to be - * excluded from `[0, num_columns)` - * - * @return vector A vector containing only the indices which are not present in - * `common_column_indices` - */ -auto non_common_column_indices(size_type num_columns, - std::vector const &common_column_indices) -{ - CUDF_EXPECTS(common_column_indices.size() <= static_cast(num_columns), - "Too many columns in common"); - std::vector all_column_indices(num_columns); - std::iota(std::begin(all_column_indices), std::end(all_column_indices), 0); - std::vector sorted_common_column_indices{common_column_indices}; - std::sort(std::begin(sorted_common_column_indices), 
std::end(sorted_common_column_indices)); - std::vector non_common_column_indices(num_columns - common_column_indices.size()); - std::set_difference(std::cbegin(all_column_indices), - std::cend(all_column_indices), - std::cbegin(sorted_common_column_indices), - std::cend(sorted_common_column_indices), - std::begin(non_common_column_indices)); - return non_common_column_indices; -} - std::pair, std::unique_ptr
> get_empty_joined_table( - table_view const &probe, - table_view const &build, - std::vector> const &columns_in_common, - cudf::hash_join::common_columns_output_side common_columns_output_side) + table_view const &probe, table_view const &build) { - std::vector columns_to_exclude(columns_in_common.size()); - std::transform(columns_in_common.begin(), - columns_in_common.end(), - columns_to_exclude.begin(), - [common_columns_output_side](auto &col) { - return common_columns_output_side == hash_join::common_columns_output_side::PROBE - ? col.second - : col.first; - }); - std::vector non_common_indices = non_common_column_indices( - common_columns_output_side == hash_join::common_columns_output_side::PROBE - ? build.num_columns() - : probe.num_columns(), - columns_to_exclude); std::unique_ptr
empty_probe = empty_like(probe); std::unique_ptr
empty_build = empty_like(build); - if (common_columns_output_side == hash_join::common_columns_output_side::PROBE) { - table_view empty_build_view = empty_build->select(non_common_indices); - empty_build = std::make_unique
(empty_build_view); - } else { - table_view empty_probe_view = empty_probe->select(non_common_indices); - empty_probe = std::make_unique
(empty_probe_view); - } return std::make_pair(std::move(empty_probe), std::move(empty_build)); } -VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b) +VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b, rmm::cuda_stream_view stream) { - CUDF_EXPECTS((a.first.size() == a.second.size()), + CUDF_EXPECTS((a.first->size() == a.second->size()), "Mismatch between sizes of vectors in vector pair"); - CUDF_EXPECTS((b.first.size() == b.second.size()), + CUDF_EXPECTS((b.first->size() == b.second->size()), "Mismatch between sizes of vectors in vector pair"); - if (a.first.empty()) { - return b; - } else if (b.first.empty()) { - return a; + if (a.first->is_empty()) { + return std::move(b); + } else if (b.first->is_empty()) { + return std::move(a); } - auto original_size = a.first.size(); - a.first.resize(a.first.size() + b.first.size()); - a.second.resize(a.second.size() + b.second.size()); - thrust::copy(b.first.begin(), b.first.end(), a.first.begin() + original_size); - thrust::copy(b.second.begin(), b.second.end(), a.second.begin() + original_size); - return a; + auto original_size = a.first->size(); + a.first->resize(a.first->size() + b.first->size(), stream); + a.second->resize(a.second->size() + b.second->size(), stream); + thrust::copy( + rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); + thrust::copy(rmm::exec_policy(stream), + b.second->begin(), + b.second->end(), + a.second->begin() + original_size); + return std::move(a); } template @@ -133,16 +85,20 @@ struct valid_range { * * @return Pair of vectors containing the left join indices complement */ -std::pair, rmm::device_vector> -get_left_join_indices_complement(rmm::device_vector &right_indices, - size_type left_table_row_count, - size_type right_table_row_count, - rmm::cuda_stream_view stream) +std::pair>, + std::unique_ptr>> +get_left_join_indices_complement( + std::unique_ptr> &right_indices, + size_type left_table_row_count, + size_type 
right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { // Get array of indices that do not appear in right_indices // Vector allocated for unmatched result - rmm::device_vector right_indices_complement(right_table_row_count); + auto right_indices_complement = + std::make_unique>(right_table_row_count, stream); // If left table is empty in a full join call then all rows of the right table // should be represented in the joined indices. This is an optimization since @@ -151,12 +107,16 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, // produce exactly the same result as the else path but will be faster. if (left_table_row_count == 0) { thrust::sequence(rmm::exec_policy(stream), - right_indices_complement.begin(), - right_indices_complement.end(), + right_indices_complement->begin(), + right_indices_complement->end(), 0); } else { // Assume all the indices in invalid_index_map are invalid - rmm::device_vector invalid_index_map(right_table_row_count, 1); + auto invalid_index_map = + std::make_unique>(right_table_row_count, stream); + thrust::uninitialized_fill( + rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); + // Functor to check for index validity since left joins can create invalid indices valid_range valid(0, right_table_row_count); @@ -164,11 +124,11 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, // Thus specifying that those locations are valid thrust::scatter_if(rmm::exec_policy(stream), thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0) + right_indices.size(), - right_indices.begin(), // Index locations - right_indices.begin(), // Stencil - Check if index location is valid - invalid_index_map.begin(), // Output indices - valid); // Stencil Predicate + thrust::make_constant_iterator(0) + right_indices->size(), + right_indices->begin(), // Index locations + 
right_indices->begin(), // Stencil - Check if index location is valid + invalid_index_map->begin(), // Output indices + valid); // Stencil Predicate size_type begin_counter = static_cast(0); size_type end_counter = static_cast(right_table_row_count); @@ -176,15 +136,19 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(begin_counter), thrust::make_counting_iterator(end_counter), - invalid_index_map.begin(), - right_indices_complement.begin(), + invalid_index_map->begin(), + right_indices_complement->begin(), thrust::identity()) - - right_indices_complement.begin(); - right_indices_complement.resize(indices_count); + right_indices_complement->begin(); + right_indices_complement->resize(indices_count, stream); } - rmm::device_vector left_invalid_indices(right_indices_complement.size(), - JoinNoneValue); + auto left_invalid_indices = + std::make_unique>(right_indices_complement->size(), stream); + thrust::fill(rmm::exec_policy(stream), + left_invalid_indices->begin(), + left_invalid_indices->end(), + JoinNoneValue); return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); } @@ -195,8 +159,6 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, * @throw cudf::logic_error if the number of columns in `build` table is 0. * @throw cudf::logic_error if the number of rows in `build` table is 0. * @throw cudf::logic_error if insertion to the hash table fails. - * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build` - * table. * * @param build Table of columns used to build join hash. * @param compare_nulls Controls whether null join-key values should match or not. @@ -256,19 +218,22 @@ std::unique_ptr> build_join_ * @return Join output indices vector pair. 
*/ template -std::pair, rmm::device_vector> probe_join_hash_table( - cudf::table_device_view build_table, - cudf::table_device_view probe_table, - multimap_type const &hash_table, - null_equality compare_nulls, - rmm::cuda_stream_view stream) +std::pair>, + std::unique_ptr>> +probe_join_hash_table(cudf::table_device_view build_table, + cudf::table_device_view probe_table, + multimap_type const &hash_table, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { size_type estimated_size = estimate_join_output_size( build_table, probe_table, hash_table, compare_nulls, stream); // If the estimated output size is zero, return immediately if (estimated_size == 0) { - return std::make_pair(rmm::device_vector{}, rmm::device_vector{}); + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } // Because we are approximating the number of joined elements, our approximation @@ -278,12 +243,13 @@ std::pair, rmm::device_vector> probe_jo rmm::device_scalar write_index(0, stream); size_type join_size{0}; - rmm::device_vector left_indices; - rmm::device_vector right_indices; + auto left_indices = std::make_unique>(0, stream, mr); + auto right_indices = std::make_unique>(0, stream, mr); + auto current_estimated_size = estimated_size; do { - left_indices.resize(estimated_size); - right_indices.resize(estimated_size); + left_indices->resize(estimated_size, stream); + right_indices->resize(estimated_size, stream); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(probe_table.num_rows(), block_size); @@ -298,8 +264,8 @@ std::pair, rmm::device_vector> probe_jo probe_table, hash_probe, equality, - left_indices.data().get(), - right_indices.data().get(), + left_indices->data(), + right_indices->data(), write_index.data(), estimated_size); @@ -310,179 +276,11 @@ std::pair, rmm::device_vector> probe_jo estimated_size *= 2; } while ((current_estimated_size < join_size)); - 
left_indices.resize(join_size); - right_indices.resize(join_size); + left_indices->resize(join_size, stream); + right_indices->resize(join_size, stream); return std::make_pair(std::move(left_indices), std::move(right_indices)); } -/** - * @brief Combines the non common probe, common probe, non common build and common build - * columns in the correct order according to `common_columns_output_side` to form the joined - * (`probe`, `build`) table pair. - * - * @param probe_noncommon_cols Columns obtained by gathering non common probe columns. - * @param probe_noncommon_col_indices Output locations of non common probe columns in the probe - * portion. - * @param probe_common_col_indices Output locations of common probe columns in the probe portion. - * @param build_noncommon_cols Columns obtained by gathering non common build columns. - * @param build_noncommon_col_indices Output locations of non common build columns in the build - * portion. - * @param build_common_col_indices Output locations of common build columns in the build portion. - * @param common_cols Columns obtained by gathering common columns from `probe` and `build` tables - * in the build portion. - * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side. - * - * @return Table pair of (`probe`, `build`). - */ -std::pair, std::unique_ptr
> combine_join_columns( - std::vector> &&probe_noncommon_cols, - std::vector const &probe_noncommon_col_indices, - std::vector const &probe_common_col_indices, - std::vector> &&build_noncommon_cols, - std::vector const &build_noncommon_col_indices, - std::vector const &build_common_col_indices, - std::vector> &&common_cols, - cudf::hash_join::common_columns_output_side common_columns_output_side) -{ - if (common_columns_output_side == cudf::hash_join::common_columns_output_side::PROBE) { - std::vector> probe_cols(probe_noncommon_cols.size() + - common_cols.size()); - for (size_t i = 0; i < probe_noncommon_cols.size(); ++i) { - probe_cols.at(probe_noncommon_col_indices.at(i)) = std::move(probe_noncommon_cols.at(i)); - } - for (size_t i = 0; i < common_cols.size(); ++i) { - probe_cols.at(probe_common_col_indices.at(i)) = std::move(common_cols.at(i)); - } - return std::make_pair(std::make_unique(std::move(probe_cols)), - std::make_unique(std::move(build_noncommon_cols))); - } else { - std::vector> build_cols(build_noncommon_cols.size() + - common_cols.size()); - for (size_t i = 0; i < build_noncommon_cols.size(); ++i) { - build_cols.at(build_noncommon_col_indices.at(i)) = std::move(build_noncommon_cols.at(i)); - } - for (size_t i = 0; i < common_cols.size(); ++i) { - build_cols.at(build_common_col_indices.at(i)) = std::move(common_cols.at(i)); - } - return std::make_pair(std::make_unique(std::move(probe_noncommon_cols)), - std::make_unique(std::move(build_cols))); - } -} - -/** - * @brief Gathers rows from `probe` and `build` table and returns a (`probe`, `build`) table pair, - * which contains the probe and build portions of the logical joined table respectively. - * - * @tparam JoinKind The type of join to be performed - * - * @param probe Probe side table - * @param build build side table - * @param joined_indices Pair of vectors containing row indices from which - * `probe` and `build` tables are gathered. 
If any row index is out of bounds, - * the contribution in the output `table` will be NULL. - * @param columns_in_common is a vector of pairs of column indices - * from tables `probe` and `build` respectively, that are "in common". - * For "common" columns, only a single output column will be produced. - * For an inner or left join, the result will be gathered from the column in - * `probe`. For a full join, the result will be gathered from both common - * columns in `probe` and `build` and concatenated to form a single column. - * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side. - * - * @return Table pair of (`probe`, `build`) containing the rows from `probe` and - * `build` specified by `joined_indices`. - * Columns in `columns_in_common` will be included in either `probe` or `build` portion as - * `common_columns_output_side` indicates. Final form would look like - * (`probe(including common columns)`, `build(excluding common columns)`) if - * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`, - * `build(including common columns)`) if `common_columns_output_side` is `BUILD`. - */ -template -std::pair, std::unique_ptr
> construct_join_output_df( - table_view const &probe, - table_view const &build, - VectorPair &joined_indices, - std::vector> const &columns_in_common, - cudf::hash_join::common_columns_output_side common_columns_output_side, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::vector probe_common_col; - probe_common_col.reserve(columns_in_common.size()); - std::vector build_common_col; - build_common_col.reserve(columns_in_common.size()); - for (const auto &c : columns_in_common) { - probe_common_col.push_back(c.first); - build_common_col.push_back(c.second); - } - std::vector probe_noncommon_col = - non_common_column_indices(probe.num_columns(), probe_common_col); - std::vector build_noncommon_col = - non_common_column_indices(build.num_columns(), build_common_col); - - out_of_bounds_policy const bounds_policy = JoinKind != join_kind::INNER_JOIN - ? out_of_bounds_policy::NULLIFY - : out_of_bounds_policy::DONT_CHECK; - - std::unique_ptr
common_table = std::make_unique
(); - // Construct the joined columns - if (join_kind::FULL_JOIN == JoinKind) { - auto complement_indices = get_left_join_indices_complement( - joined_indices.second, probe.num_rows(), build.num_rows(), stream); - if (not columns_in_common.empty()) { - auto common_from_build = detail::gather(build.select(build_common_col), - complement_indices.second.begin(), - complement_indices.second.end(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - auto common_from_probe = detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.end(), - bounds_policy, - stream, - rmm::mr::get_current_device_resource()); - common_table = cudf::detail::concatenate( - std::vector({common_from_build->view(), common_from_probe->view()}), - stream, - mr); - } - joined_indices = concatenate_vector_pairs(complement_indices, joined_indices); - } else { - if (not columns_in_common.empty()) { - common_table = detail::gather(probe.select(probe_common_col), - joined_indices.first.begin(), - joined_indices.first.end(), - bounds_policy, - stream, - mr); - } - } - - // Construct the probe non common columns - std::unique_ptr
probe_table = detail::gather(probe.select(probe_noncommon_col), - joined_indices.first.begin(), - joined_indices.first.end(), - bounds_policy, - stream, - mr); - - std::unique_ptr
build_table = detail::gather(build.select(build_noncommon_col), - joined_indices.second.begin(), - joined_indices.second.end(), - bounds_policy, - stream, - mr); - - return combine_join_columns(probe_table->release(), - probe_noncommon_col, - probe_common_col, - build_table->release(), - build_noncommon_col, - build_common_col, - common_table->release(), - common_columns_output_side); -} - std::unique_ptr combine_table_pair(std::unique_ptr &&left, std::unique_ptr &&right) { @@ -499,147 +297,112 @@ std::unique_ptr combine_table_pair(std::unique_ptr &&l hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, - std::vector const &build_on, null_equality compare_nulls, rmm::cuda_stream_view stream) - : _build(build), - _build_selected(build.select(build_on)), - _build_on(build_on), - _hash_table(nullptr) + : _build(build), _hash_table(nullptr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(0 != _build.num_columns(), "Hash join build table is empty"); CUDF_EXPECTS(_build.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Build column size is too big for hash join"); - if (_build_on.empty() || 0 == build.num_rows()) { return; } + if (0 == build.num_rows()) { return; } - _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream); + _hash_table = build_join_hash_table(_build, compare_nulls, stream); } -std::pair, std::unique_ptr> -hash_join::hash_join_impl::inner_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair>, + std::unique_ptr>> +hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, 
probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); + return compute_hash_join(probe, compare_nulls, stream, mr); } -std::unique_ptr hash_join::hash_join_impl::left_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair>, + std::unique_ptr>> +hash_join::hash_join_impl::left_join(cudf::table_view const &probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - auto probe_build_pair = - compute_hash_join(probe, - probe_on, - columns_in_common, - common_columns_output_side::PROBE, - compare_nulls, - stream, - mr); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); + return compute_hash_join(probe, compare_nulls, stream, mr); } -std::unique_ptr hash_join::hash_join_impl::full_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const &columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair>, + std::unique_ptr>> +hash_join::hash_join_impl::full_join(cudf::table_view const &probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); - auto probe_build_pair = - compute_hash_join(probe, - probe_on, - columns_in_common, - common_columns_output_side::PROBE, - compare_nulls, - stream, - mr); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); + return compute_hash_join(probe, compare_nulls, stream, mr); } template -std::pair, std::unique_ptr> -hash_join::hash_join_impl::compute_hash_join( - cudf::table_view const &probe, - std::vector const &probe_on, - std::vector> const 
&columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const +std::pair>, + std::unique_ptr>> +hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Probe column size is too big for hash join"); - CUDF_EXPECTS(_build_on.size() == probe_on.size(), + CUDF_EXPECTS(_build.num_columns() == probe.num_columns(), "Mismatch in number of columns to be joined on"); - CUDF_EXPECTS(std::all_of(columns_in_common.begin(), - columns_in_common.end(), - [this, &probe_on](auto pair) { - size_t p = std::find(probe_on.begin(), probe_on.end(), pair.first) - - probe_on.begin(); - size_t b = std::find(_build_on.begin(), _build_on.end(), pair.second) - - _build_on.begin(); - return (p != probe_on.size()) && (b != _build_on.size()) && (p == b); - }), - "Invalid values passed to columns_in_common"); - - if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) { - return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side); + if (is_trivial_join(probe, _build, JoinKind)) { + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } - auto probe_selected = probe.select(probe_on); - CUDF_EXPECTS(std::equal(std::cbegin(_build_selected), - std::cend(_build_selected), - std::cbegin(probe_selected), - std::cend(probe_selected), + CUDF_EXPECTS(std::equal(std::cbegin(_build), + std::cend(_build), + std::cbegin(probe), + std::cend(probe), [](const auto &b, const auto &p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); - constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == 
cudf::detail::join_kind::FULL_JOIN) - ? cudf::detail::join_kind::LEFT_JOIN - : JoinKind; - auto joined_indices = probe_join_indices(probe_selected, compare_nulls, stream); - return cudf::detail::construct_join_output_df( - probe, _build, joined_indices, columns_in_common, common_columns_output_side, stream, mr); + return probe_join_indices(probe, compare_nulls, stream, mr); } template -std::enable_if_t, rmm::device_vector>> +std::pair>, + std::unique_ptr>> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, null_equality compare_nulls, - rmm::cuda_stream_view stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { // Trivial left join case - exit early - if (!_hash_table && JoinKind == cudf::detail::join_kind::LEFT_JOIN) { - return get_trivial_left_join_indices(probe, stream); + if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) { + return get_trivial_left_join_indices(probe, stream, mr); } CUDF_EXPECTS(_hash_table, "Hash table of hash join is null."); - auto build_table = cudf::table_device_view::create(_build_selected, stream); + auto build_table = cudf::table_device_view::create(_build, stream); auto probe_table = cudf::table_device_view::create(probe, stream); - return cudf::detail::probe_join_hash_table( - *build_table, *probe_table, *_hash_table, compare_nulls, stream); + + constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) + ? 
cudf::detail::join_kind::LEFT_JOIN + : JoinKind; + auto join_indices = cudf::detail::probe_join_hash_table( + *build_table, *probe_table, *_hash_table, compare_nulls, stream, mr); + + if (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + } + return join_indices; } } // namespace cudf diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index b37f228f6d3..aaa25e8f941 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -15,6 +15,9 @@ */ #pragma once +#include +#include +#include #include #include @@ -25,7 +28,7 @@ #include #include -#include +#include #include #include @@ -178,19 +181,29 @@ size_type estimate_join_output_size(table_device_view build_table, * * @param left Table of left columns to join * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the result * * @return Join output indices vector pair */ -inline std::pair, rmm::device_vector> -get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream) +inline std::pair>, + std::unique_ptr>> +get_trivial_left_join_indices( + table_view const& left, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - rmm::device_vector left_indices(left.num_rows()); - thrust::sequence(rmm::exec_policy(stream), left_indices.begin(), left_indices.end(), 0); - rmm::device_vector right_indices(left.num_rows()); - thrust::fill(rmm::exec_policy(stream), right_indices.begin(), right_indices.end(), JoinNoneValue); + auto left_indices = std::make_unique>(left.num_rows(), stream, mr); + thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); + auto right_indices = 
+ std::make_unique>(left.num_rows(), stream, mr); + thrust::fill( + rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); return std::make_pair(std::move(left_indices), std::move(right_indices)); } +std::pair, std::unique_ptr
> get_empty_joined_table( + table_view const& probe, table_view const& build); + std::unique_ptr combine_table_pair(std::unique_ptr&& left, std::unique_ptr&& right); @@ -207,106 +220,52 @@ struct hash_join::hash_join_impl { private: cudf::table_view _build; - cudf::table_view _build_selected; - std::vector _build_on; std::unique_ptr> _hash_table; public: /** - * @brief Constructor that internally builds the hash table based on the given `build` table and - * column indices specified by `build_on` for subsequent probe calls. + * @brief Constructor that internally builds the hash table based on the given `build` table * * @throw cudf::logic_error if the number of columns in `build` table is 0. * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. - * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build` - * table. * * @param build The build table, from which the hash table is built. - * @param build_on The column indices from `build` to join on. * @param compare_nulls Controls whether null join-key values should match or not. 
*/ hash_join_impl(cudf::table_view const& build, - std::vector const& build_on, null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); - std::pair, std::unique_ptr> inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::unique_ptr left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::unique_ptr full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair>, + std::unique_ptr>> + inner_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + std::pair>, + std::unique_ptr>> + left_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + std::pair>, + std::unique_ptr>> + full_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; private: - /** - * @brief Performs hash join by probing the columns provided in `probe` as per - * the joining indices given in `probe_on` and returns a (`probe`, `_build`) table pair, which - * contains the probe and build portions of the logical joined table respectively. - * - * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (`P`, `B`) where `P` does not exist in `probe_on` or `B` does not exist in - * `_build_on`. 
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices - * (`P`, `B`) such that the location of `P` within `probe_on` is not equal to - * the location of `B` within `_build_on`. - * @throw cudf::logic_error if the number of elements in `probe_on` and - * `_build_on` are not equal. - * @throw cudf::logic_error if the number of columns in `probe` is 0. - * @throw cudf::logic_error if the number of rows in `probe` table exceeds MAX_JOIN_SIZE. - * @throw std::out_of_range if elements of `probe_on` exceed the number of columns in the `probe` - * table. - * @throw cudf::logic_error if types do not match between joining columns. - * - * @tparam JoinKind The type of join to be performed. - * - * @param probe The probe table. - * @param probe_on The column's indices from `probe` to join on. - * Column `i` from `probe_on` will be compared against column `i` of `_build_on`. - * @param columns_in_common is a vector of pairs of column indices into - * `probe` and `_build`, respectively, that are "in common". For "common" - * columns, only a single output column will be produced, which is gathered - * from `probe_on` columns. Else, for every column in `probe_on` and `_build_on`, - * an output column will be produced. For each of these pairs (P, B), P - * should exist in `probe_on` and B should exist in `_build_on`. - * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table pair of (`probe`, `_build`) of joining both tables on the columns - * specified by `probe_on` and `_build_on`. 
The resulting table pair will be joined columns of - * (`probe(including common columns)`, `_build(excluding common columns)`) if - * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`, - * `_build(including common columns)`) if `common_columns_output_side` is `BUILD`. - */ template - std::pair, std::unique_ptr> compute_hash_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; + std::pair>, + std::unique_ptr>> + compute_hash_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; /** * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, @@ -320,15 +279,17 @@ struct hash_join::hash_join_impl { * @param probe_table Table of probe side columns to join. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned vectors. * * @return Join output indices vector pair. */ template - std::enable_if_t, rmm::device_vector>> + std::pair>, + std::unique_ptr>> probe_join_indices(cudf::table_view const& probe, null_equality compare_nulls, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; }; } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index ce27cfcd616..f2e4bab02c6 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -26,68 +26,102 @@ namespace cudf { namespace detail { -std::unique_ptr
inner_join( - table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +inner_join(table_view const& left_input, + table_view const& right_input, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input.select(left_on), right_input.select(right_on)}, + {left_input, right_input}, stream, rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones - auto const left = scatter_columns(matched.second.front(), left_on, left_input); - auto const right = scatter_columns(matched.second.back(), right_on, right_input); + auto const left = matched.second.front(); + auto const right = matched.second.back(); // For `inner_join`, we can freely choose either the `left` or `right` table to use for // building/probing the hash map. Because building is typically more expensive than probing, we // build the hash map from the smaller table. 
if (right.num_rows() > left.num_rows()) { - cudf::hash_join hj_obj(left, left_on, compare_nulls, stream); - auto actual_columns_in_common = columns_in_common; - std::for_each(actual_columns_in_common.begin(), actual_columns_in_common.end(), [](auto& pair) { - std::swap(pair.first, pair.second); - }); - auto probe_build_pair = hj_obj.inner_join(right, - right_on, - actual_columns_in_common, - cudf::hash_join::common_columns_output_side::BUILD, - compare_nulls, - stream, - mr); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.second), - std::move(probe_build_pair.first)); + cudf::hash_join hj_obj(left, compare_nulls, stream); + auto result = hj_obj.inner_join(right, compare_nulls, stream, mr); + return std::make_pair(std::move(result.second), std::move(result.first)); } else { - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - auto probe_build_pair = hj_obj.inner_join(left, - left_on, - columns_in_common, - cudf::hash_join::common_columns_output_side::PROBE, - compare_nulls, - stream, - mr); - return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), - std::move(probe_build_pair.second)); + cudf::hash_join hj_obj(right, compare_nulls, stream); + return hj_obj.inner_join(left, compare_nulls, stream, mr); } } -std::unique_ptr
left_join( - table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
inner_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Make sure any dictionary columns have matched key sets. + // This will return any new dictionary columns created as well as updated table_views. + auto matched = cudf::dictionary::detail::match_dictionaries( + {left_input.select(left_on), right_input.select(right_on)}, + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + + // now rebuild the table views with the updated ones + auto const left = scatter_columns(matched.second.front(), left_on, left_input); + auto const right = scatter_columns(matched.second.back(), right_on, right_input); + + auto join_indices = inner_join(left.select(left_on), right.select(right_on), compare_nulls, mr); + std::unique_ptr
left_result = detail::gather(left, + join_indices.first->begin(), + join_indices.first->end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.second->begin(), + join_indices.second->end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); +} + +std::pair>, + std::unique_ptr>> +left_join(table_view const& left_input, + table_view const& right_input, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Make sure any dictionary columns have matched key sets. + // This will return any new dictionary columns created as well as updated table_views. + auto matched = cudf::dictionary::detail::match_dictionaries( + {left_input, right_input}, // these should match + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + // now rebuild the table views with the updated ones + table_view const left = matched.second.front(); + table_view const right = matched.second.back(); + + cudf::hash_join hj_obj(right, compare_nulls, stream); + return hj_obj.left_join(left, compare_nulls, stream, mr); +} + +std::unique_ptr
left_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -99,19 +133,58 @@ std::unique_ptr
left_join( table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr); + auto join_indices = left_join(left.select(left_on), right.select(right_on), compare_nulls); + + if ((left_on.empty() || right_on.empty()) || + is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) { + auto probe_build_pair = get_empty_joined_table(left, right); + return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), + std::move(probe_build_pair.second)); + } + std::unique_ptr
left_result = detail::gather(left, + join_indices.first->begin(), + join_indices.first->end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.second->begin(), + join_indices.second->end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); } -std::unique_ptr
full_join( - table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +full_join(table_view const& left_input, + table_view const& right_input, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Make sure any dictionary columns have matched key sets. + // This will return any new dictionary columns created as well as updated table_views. + auto matched = cudf::dictionary::detail::match_dictionaries( + {left_input, right_input}, // these should match + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + // now rebuild the table views with the updated ones + table_view const left = matched.second.front(); + table_view const right = matched.second.back(); + + cudf::hash_join hj_obj(right, compare_nulls, stream); + return hj_obj.full_join(left, compare_nulls, stream, mr); +} + +std::unique_ptr
full_join(table_view const& left_input, + table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. @@ -123,8 +196,27 @@ std::unique_ptr
full_join( table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - cudf::hash_join hj_obj(right, right_on, compare_nulls, stream); - return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, stream, mr); + auto join_indices = full_join(left.select(left_on), right.select(right_on), compare_nulls); + + if ((left_on.empty() || right_on.empty()) || + is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) { + auto probe_build_pair = get_empty_joined_table(left, right); + return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), + std::move(probe_build_pair.second)); + } + std::unique_ptr
left_result = detail::gather(left, + join_indices.first->begin(), + join_indices.first->end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + std::unique_ptr
right_result = detail::gather(right, + join_indices.second->begin(), + join_indices.second->end(), + out_of_bounds_policy::NULLIFY, + stream, + mr); + return combine_table_pair(std::move(left_result), std::move(right_result)); } } // namespace detail @@ -132,90 +224,111 @@ std::unique_ptr
full_join( hash_join::~hash_join() = default; hash_join::hash_join(cudf::table_view const& build, - std::vector const& build_on, null_equality compare_nulls, rmm::cuda_stream_view stream) - : impl{std::make_unique(build, build_on, compare_nulls, stream)} + : impl{std::make_unique(build, compare_nulls, stream)} { } -std::pair, std::unique_ptr> hash_join::inner_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - common_columns_output_side common_columns_output_side, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair>, + std::unique_ptr>> +hash_join::inner_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - return impl->inner_join( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); + return impl->inner_join(probe, compare_nulls, stream, mr); } -std::unique_ptr hash_join::left_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair>, + std::unique_ptr>> +hash_join::left_join(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); + return impl->left_join(probe, compare_nulls, stream, mr); } -std::unique_ptr hash_join::full_join( - cudf::table_view const& probe, - std::vector const& probe_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +std::pair>, + std::unique_ptr>> +hash_join::full_join(cudf::table_view const& probe, + null_equality compare_nulls, + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); + return impl->full_join(probe, compare_nulls, stream, mr); } // external APIs -std::unique_ptr
inner_join( - table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +inner_join(table_view const& left, + table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::inner_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); +} + +std::unique_ptr
inner_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::inner_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr
left_join( - table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::pair>, + std::unique_ptr>> +left_join(table_view const& left, + table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::left_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); +} + +std::unique_ptr
left_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + +std::pair>, + std::unique_ptr>> +full_join(table_view const& left, + table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::full_join(left, right, compare_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr
full_join( - table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - std::vector> const& columns_in_common, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
full_join(table_view const& left, + table_view const& right, + std::vector const& left_on, + std::vector const& right_on, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::full_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index f0c158c1ef6..9312704f065 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include #include @@ -29,9 +31,10 @@ constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; -constexpr size_type JoinNoneValue = -1; +constexpr size_type JoinNoneValue = std::numeric_limits::min(); -using VectorPair = std::pair, rmm::device_vector>; +using VectorPair = std::pair>, + std::unique_ptr>>; using multimap_type = concurrent_unordered_multimap; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; -inline bool is_trivial_join(table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - join_kind join_type) +inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) { // If there is nothing to join, then send empty table with all columns - if (left_on.empty() || right_on.empty()) { return true; } + if (left.is_empty() || right.is_empty()) { return true; } // If left join and the left table is empty, return immediately if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 9d046f9983c..80a1ef9e204 100644 --- a/cpp/src/join/semi_join.cu +++ 
b/cpp/src/join/semi_join.cu @@ -17,20 +17,106 @@ #include #include +#include + #include #include #include +#include #include #include +#include #include #include #include +#include #include #include namespace cudf { namespace detail { + +template +std::unique_ptr> left_semi_anti_join( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); + CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); + + if (is_trivial_join(left_keys, right_keys, JoinKind)) { + return std::make_unique>(0, stream, mr); + } + if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { + auto result = + std::make_unique>(left_keys.num_rows(), stream, mr); + thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end()); + return result; + } + + auto const left_num_rows = left_keys.num_rows(); + auto const right_num_rows = right_keys.num_rows(); + + // Only care about existence, so we'll use an unordered map (other joins need a multimap) + using hash_table_type = concurrent_unordered_map; + + // Create hash table containing all keys found in right table + auto right_rows_d = table_device_view::create(right_keys, stream); + size_t const hash_table_size = compute_hash_table_size(right_num_rows); + row_hash hash_build{*right_rows_d}; + row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; + + // Going to join it with left table + auto left_rows_d = table_device_view::create(left_keys, stream); + row_hash hash_probe{*left_rows_d}; + row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; + + auto hash_table_ptr = hash_table_type::create(hash_table_size, + stream, + std::numeric_limits::max(), + std::numeric_limits::max(), + 
hash_build, + equality_build); + auto hash_table = *hash_table_ptr; + + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + right_num_rows, + [hash_table] __device__(size_type idx) mutable { + hash_table.insert(thrust::make_pair(idx, true)); + }); + + // + // Now we have a hash table, we need to iterate over the rows of the left table + // and check to see if they are contained in the hash table + // + + // For semi join we want contains to be true, for anti join we want contains to be false + bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); + + auto gather_map = + std::make_unique>(left_num_rows, stream, mr); + + // gather_map_end will be the end of valid data in gather_map + auto gather_map_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(left_num_rows), + gather_map->begin(), + [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) { + auto pos = hash_table.find(idx, hash_probe, equality_probe); + return (pos != hash_table.end()) == join_type_boolean; + }); + + auto join_size = thrust::distance(gather_map->begin(), gather_map_end); + gather_map->resize(join_size, stream); + return gather_map; +} + /** * @brief Performs a left semi or anti join on the specified columns of two * tables (left, right) @@ -57,8 +143,6 @@ namespace detail { * The column from `right` indicated by `right_on[i]` * will be compared against the column from `left` * indicated by `left_on[i]`. - * @param[in] return_columns A vector of column indices from `left` to - * include in the returned table. * @param[in] compare_nulls Controls whether null join-key values should match or not. 
* @param[in] mr Device memory resource to used to allocate the returned table's * device memory @@ -66,8 +150,7 @@ namespace detail { * @tparam join_kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN * * @returns Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. The resulting table - * will contain `return_columns` from `left` that match in right. + * specified by `left_on` and `right_on`. */ template std::unique_ptr left_semi_anti_join( @@ -75,27 +158,19 @@ std::unique_ptr left_semi_anti_join( cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); - CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty"); CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); - if (0 == return_columns.size()) { return empty_like(left.select(return_columns)); } - - if (is_trivial_join(left, right, left_on, right_on, JoinKind)) { - return empty_like(left.select(return_columns)); + if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, JoinKind)) { + return empty_like(left); } - auto const left_num_rows = left.num_rows(); - auto const right_num_rows = right.num_rows(); - - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_num_rows)) { + if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) { // Everything matches, just copy the proper columns from the left table - return std::make_unique
(left.select(return_columns), stream, mr); + return std::make_unique
(left, stream, mr); } // Make sure any dictionary columns have matched key sets. @@ -108,91 +183,64 @@ std::unique_ptr left_semi_anti_join( auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); - // Only care about existence, so we'll use an unordered map (other joins need a multimap) - using hash_table_type = concurrent_unordered_map; - - // Create hash table containing all keys found in right table - auto right_rows_d = table_device_view::create(right_selected, stream); - size_t const hash_table_size = compute_hash_table_size(right_num_rows); - row_hash hash_build{*right_rows_d}; - row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; - - // Going to join it with left table - auto left_rows_d = table_device_view::create(left_selected, stream); - row_hash hash_probe{*left_rows_d}; - row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; - - auto hash_table_ptr = hash_table_type::create(hash_table_size, - stream, - std::numeric_limits::max(), - std::numeric_limits::max(), - hash_build, - equality_build); - auto hash_table = *hash_table_ptr; - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - right_num_rows, - [hash_table] __device__(size_type idx) mutable { - hash_table.insert(thrust::make_pair(idx, true)); - }); - - // - // Now we have a hash table, we need to iterate over the rows of the left table - // and check to see if they are contained in the hash table - // + auto gather_map = + left_semi_anti_join(left_selected, right_selected, compare_nulls, stream); - // For semi join we want contains to be true, for anti join we want contains to be false - bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); - - rmm::device_vector gather_map(left_num_rows); - - // gather_map_end will be the end of valid data in gather_map - auto gather_map_end = thrust::copy_if( - rmm::exec_policy(stream), 
- thrust::make_counting_iterator(0), - thrust::make_counting_iterator(left_num_rows), - gather_map.begin(), - [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) { - auto pos = hash_table.find(idx, hash_probe, equality_probe); - return (pos != hash_table.end()) == join_type_boolean; - }); - - // rebuild left table for call to gather auto const left_updated = scatter_columns(left_selected, left_on, left); - return cudf::detail::gather(left_updated.select(return_columns), - gather_map.begin(), - gather_map_end, + return cudf::detail::gather(left_updated, + gather_map->begin(), + gather_map->end(), out_of_bounds_policy::DONT_CHECK, stream, mr); } + } // namespace detail std::unique_ptr left_semi_join(cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + +std::unique_ptr> left_semi_join( + cudf::table_view const& left, + cudf::table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::left_semi_anti_join( + left, right, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, std::vector const& right_on, - std::vector const& return_columns, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); + left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); +} + 
+std::unique_ptr> left_anti_join( + cudf::table_view const& left, + cudf::table_view const& right, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::left_semi_anti_join( + left, right, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index efc5330ea7d..32192234c56 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -33,11 +33,15 @@ #include #include +#include + template using column_wrapper = cudf::test::fixed_width_column_wrapper; using strcol_wrapper = cudf::test::strings_column_wrapper; using CVector = std::vector>; using Table = cudf::table; +constexpr cudf::size_type NoneValue = + std::numeric_limits::min(); // TODO: how to test if this isn't public? struct JoinTest : public cudf::test::BaseFixture { }; @@ -58,58 +62,11 @@ TEST_F(JoinTest, EmptySentinelRepro) cudf::table_view left({left_first_col, left_second_col, left_third_col}); cudf::table_view right({right_first_col, right_second_col, right_third_col}); - auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2}, {{0, 0}, {1, 1}, {2, 2}}); + auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2}); EXPECT_EQ(result->num_rows(), 1); } -TEST_F(JoinTest, InvalidCommonColumnIndices) -{ - column_wrapper col0_0{{3, 1, 2, 0, 3}}; - column_wrapper col0_1{{0, 1, 2, 4, 1}}; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - column_wrapper col1_1{{1, 0, 1, 2, 1}}; - - CVector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - EXPECT_THROW(cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 1}, {1, 0}}), cudf::logic_error); -} - -TEST_F(JoinTest, FullJoinNoCommon) -{ - column_wrapper col0_0{{0, 1}}; - column_wrapper col1_0{{0, 2}}; - CVector cols0, cols1; - 
cols0.push_back(col0_0.release()); - cols1.push_back(col1_0.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - column_wrapper exp_col0_0{{0, 1, -1}, {1, 1, 0}}; - column_wrapper exp_col0_1{{0, -1, 2}, {1, 0, 1}}; - CVector exp_cols; - exp_cols.push_back(exp_col0_0.release()); - exp_cols.push_back(exp_col0_1.release()); - Table gold(std::move(exp_cols)); - - auto result = cudf::full_join(t0, t1, {0}, {0}, {}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} - TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) { column_wrapper col0_0{{3, 1, 2, 0, 3}}; @@ -131,7 +88,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0}, {0}, {}); + auto result = cudf::left_join(t0, t1, {0}, {0}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -153,7 +110,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinNoNulls) @@ -177,24 +134,32 @@ TEST_F(JoinTest, FullJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{2, 2, 0, 4, 3, 3, 1, 2, 
0}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}); - column_wrapper col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""}, + {1, 1, 1, 1, 1, 0, 0, 0, 0}); + column_wrapper col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; + strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"}, + {0, 0, 0, 0, 1, 1, 1, 1, 1}); + column_wrapper col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinWithNulls) @@ -218,24 +183,32 @@ TEST_F(JoinTest, FullJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{2, 2, 0, -1, 3, 3, 1, 2, 0}, {1, 1, 1, 0, 1, 1, 1, 1, 1}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", 
"s2", "s4"}); - column_wrapper col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""}, + {1, 1, 1, 1, 1, 0, 0, 0, 0}); + column_wrapper col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}; + column_wrapper col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 0}}; + strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"}, + {0, 0, 0, 0, 1, 1, 1, 1, 1}); + column_wrapper col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}; + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinOnNulls) @@ -262,7 +235,7 @@ TEST_F(JoinTest, FullJoinOnNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -273,20 +246,26 @@ TEST_F(JoinTest, FullJoinOnNulls) cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t"); cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t"); #endif - - column_wrapper 
col_gold_0{{ 2, 5, 3, -1}, - { 1, 1, 1, 0}}; - strcol_wrapper col_gold_1({ "s1", "s0", "s0", "s1"}); - column_wrapper col_gold_2{{ -1, -1, 0, 1}, - { 0, 0, 1, 1}}; - column_wrapper col_gold_3{{ 1, 4, 2, 8}, - { 1, 1, 1, 1}}; + + column_wrapper col_gold_0{{ 3, -1, -1, -1}, + { 1, 0, 0, 0}}; + strcol_wrapper col_gold_1{{ "s0", "s1", "", ""}, + { 1, 1, 0, 0}}; + column_wrapper col_gold_2{{ 0, 1, -1, -1}, + { 1, 1, 0, 0}}; + column_wrapper col_gold_3{{ 3, -1, 2, 5}, + { 1, 0, 1, 1}}; + strcol_wrapper col_gold_4{{ "s0", "s1", "s1", "s0"}}; + column_wrapper col_gold_5{{ 2, 8, 1, 4}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); @@ -300,22 +279,27 @@ TEST_F(JoinTest, FullJoinOnNulls) cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t"); #endif - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); // Repeat test with compare_nulls_equal=false, // as per SQL standard. 
- result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL); + result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); sorted_result = cudf::gather(result->view(), *result_sort_order); - col_gold_0 = {{ 2, 5, 3, -1, -1}, - { 1, 1, 1, 0, 0}}; - col_gold_1 = strcol_wrapper({ "s1", "s0", "s0", "s1", "s1"}); - col_gold_2 = {{ -1, -1, 0, -1, 1}, - { 0, 0, 1, 0, 1}}; - col_gold_3 = {{ 1, 4, 2, 8, -1}, - { 1, 1, 1, 1, 0}}; + col_gold_0 = {{ 3, -1, -1, -1, -1}, + { 1, 0, 0, 0, 0}}; + col_gold_1 = strcol_wrapper{{ "s0", "s1", "", "", ""}, + { 1, 1, 0, 0, 0}}; + col_gold_2 = {{ 0, 1, -1, -1, -1}, + { 1, 1, 0, 0, 0}}; + col_gold_3 = {{ 3, -1, 2, 5, -1}, + { 1, 0, 1, 1, 0}}; + col_gold_4 = strcol_wrapper{{ "s0", "", "s1", "s0", "s1"}, + { 1, 0, 1, 1, 1}}; + col_gold_5 = {{ 2, -1, 1, 4, 8}, + { 1, 0, 1, 1, 1}}; // clang-format on @@ -324,23 +308,26 @@ TEST_F(JoinTest, FullJoinOnNulls) cols_gold_nulls_unequal.push_back(col_gold_1.release()); cols_gold_nulls_unequal.push_back(col_gold_2.release()); cols_gold_nulls_unequal.push_back(col_gold_3.release()); + cols_gold_nulls_unequal.push_back(col_gold_4.release()); + cols_gold_nulls_unequal.push_back(col_gold_5.release()); + Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)}; gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view()); sorted_gold = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, LeftJoinNoNulls) { - column_wrapper col0_0{{3, 1, 2, 0, 3}}; + column_wrapper col0_0({3, 1, 2, 0, 3}); strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; + column_wrapper col0_2({0, 1, 2, 4, 1}); - column_wrapper col1_0{{2, 2, 0, 4, 3}}; + column_wrapper col1_0({2, 2, 0, 4, 3}); strcol_wrapper col1_1({"s1", 
"s0", "s1", "s2", "s1"}); - column_wrapper col1_2{{1, 0, 1, 2, 1}}; + column_wrapper col1_2({1, 0, 1, 2, 1}); CVector cols0, cols1; cols0.push_back(col0_0.release()); @@ -353,30 +340,34 @@ TEST_F(JoinTest, LeftJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}); - column_wrapper col_gold_2{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; + column_wrapper col_gold_0({3, 1, 2, 0, 3}); + strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1"}); + column_wrapper col_gold_2({0, 1, 2, 4, 1}); + column_wrapper col_gold_3{{-1, -1, -1, -1, 3}, {0, 0, 0, 0, 1}}; + strcol_wrapper col_gold_4{{"", "", "", "", "s1"}, {0, 0, 0, 0, 1}}; + column_wrapper col_gold_5{{-1, -1, -1, -1, 1}, {0, 0, 0, 0, 1}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, LeftJoinWithNulls) { column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); + strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1}); column_wrapper col0_2{{0, 1, 2, 
4, 1}}; column_wrapper col1_0{{2, 2, 0, 4, 3}}; @@ -394,24 +385,29 @@ TEST_F(JoinTest, LeftJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); - column_wrapper col_gold_0{{3, 2, 1, 2, 0}, {1, 1, 1, 1, 1}}; - strcol_wrapper col_gold_1({"s1", "s0", "s1", "", "s4"}, {1, 1, 1, 0, 1}); - column_wrapper col_gold_2{{0, 1, 1, 2, 4}, {1, 1, 1, 1, 1}}; - column_wrapper col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; + column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {1, 1, 1, 1, 1}}; + strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col_gold_2{{0, 1, 2, 4, 1}, {1, 1, 1, 1, 1}}; + column_wrapper col_gold_3{{3, -1, -1, -1, 2}, {1, 0, 0, 0, 1}}; + strcol_wrapper col_gold_4{{"s1", "", "", "", "s0"}, {1, 0, 0, 0, 1}}; + column_wrapper col_gold_5{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}; + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, LeftJoinOnNulls) @@ -438,7 +434,7 @@ TEST_F(JoinTest, LeftJoinOnNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = 
cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -449,21 +445,27 @@ TEST_F(JoinTest, LeftJoinOnNulls) cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t"); cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t"); #endif - + column_wrapper col_gold_0{{ 3, -1, 2}, { 1, 0, 1}}; strcol_wrapper col_gold_1({ "s0", "s1", "s2"}, { 1, 1, 1}); - column_wrapper col_gold_2{{ 0, 1, 2}, + column_wrapper col_gold_2{{ 0, 1, 2}, { 1, 1, 1}}; - column_wrapper col_gold_3{{ 2, 8, -1}, + column_wrapper col_gold_3{{ 3, -1, -1}, + { 1, 0, 0}}; + strcol_wrapper col_gold_4({ "s0", "s1", ""}, + { 1, 1, 0}); + column_wrapper col_gold_5{{ 2, 8, -1}, { 1, 1, 0}}; - + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); @@ -477,23 +479,28 @@ TEST_F(JoinTest, LeftJoinOnNulls) cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t"); #endif - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); // Repeat test with compare_nulls_equal=false, // as per SQL standard. 
- result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL); + result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); sorted_result = cudf::gather(result->view(), *result_sort_order); - col_gold_0 = {{ 3, -1, 2}, - { 1, 0, 1}}; - col_gold_1 = strcol_wrapper({ "s0", "s1", "s2"}, - { 1, 1, 1}); - col_gold_2 = {{ 0, 1, 2}, - { 1, 1, 1}}; - col_gold_3 = {{ 2, -1, -1}, - { 1, 0, 0}}; + + col_gold_0 = {{ 3, -1, 2}, + { 1, 0, 1}}; + col_gold_1 = {{ "s0", "s1", "s2"}, + { 1, 1, 1}}; + col_gold_2 = {{ 0, 1, 2}, + { 1, 1, 1}}; + col_gold_3 = {{ 3, -1, -1}, + { 1, 0, 0}}; + col_gold_4 = {{ "s0", "", ""}, + { 1, 0, 0}}; + col_gold_5 = {{ 2, -1, -1}, + { 1, 0, 0}}; // clang-format on CVector cols_gold_nulls_unequal; @@ -506,7 +513,7 @@ TEST_F(JoinTest, LeftJoinOnNulls) gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view()); sorted_gold = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, InnerJoinSizeOverflow) @@ -529,7 +536,7 @@ TEST_F(JoinTest, InnerJoinSizeOverflow) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}), cudf::logic_error); + EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}), cudf::logic_error); } TEST_F(JoinTest, InnerJoinNoNulls) @@ -553,86 +560,28 @@ TEST_F(JoinTest, InnerJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); column_wrapper col_gold_0{{3, 2, 2}}; strcol_wrapper col_gold_1({"s1", "s0", "s0"}); column_wrapper col_gold_2{{0, 2, 
1}}; - column_wrapper col_gold_3{{1, 0, 0}}; + column_wrapper col_gold_3{{3, 2, 2}}; + strcol_wrapper col_gold_4({"s1", "s0", "s0"}); + column_wrapper col_gold_5{{1, 0, 0}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} - -TEST_F(JoinTest, InnerJoinNonAlignedCommon) -{ - CVector cols0, cols1; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); - cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1"}).release()); - cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), 
*gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); -} - -TEST_F(JoinTest, InnerJoinNonAlignedCommonSwap) -{ - CVector cols0, cols1; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 5}}.release()); - cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0"}).release()); - cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1, 0}}.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - auto result = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, InnerJoinWithNulls) @@ -656,37 +605,41 @@ TEST_F(JoinTest, InnerJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); 
column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); column_wrapper col_gold_2{{0, 1}}; - column_wrapper col_gold_3{{1, -1}, {1, 0}}; + column_wrapper col_gold_3{{3, 2}}; + strcol_wrapper col_gold_4({"s1", "s0"}, {1, 1}); + column_wrapper col_gold_5{{1, -1}, {1, 0}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } -// Test to check join behaviour when join keys are null. +// // Test to check join behaviour when join keys are null. TEST_F(JoinTest, InnerJoinOnNulls) { // clang-format off column_wrapper col0_0{{ 3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s8", "s4", "s0"}, + strcol_wrapper col0_1({"s1", "s1", "s8", "s4", "s0"}, { 1, 1, 0, 1, 1}); column_wrapper col0_2{{ 0, 1, 2, 4, 1}}; column_wrapper col1_0{{ 2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, { 1, 0, 1, 1, 1}); column_wrapper col1_2{{ 1, 0, 1, 2, 1}}; @@ -701,38 +654,47 @@ TEST_F(JoinTest, InnerJoinOnNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); column_wrapper col_gold_0 {{ 3, 2}}; - strcol_wrapper col_gold_1 ({"s1", "s0"}, + strcol_wrapper col_gold_1 ({"s1", "s0"}, { 1, 0}); column_wrapper 
col_gold_2{{ 0, 2}}; - column_wrapper col_gold_3{{ 1, 0}}; + column_wrapper col_gold_3 {{ 3, 2}}; + strcol_wrapper col_gold_4 ({"s1", "s0"}, + { 1, 0}); + column_wrapper col_gold_5{{ 1, 0}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); cols_gold.push_back(col_gold_2.release()); cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); + // Repeat test with compare_nulls_equal=false, // as per SQL standard. - result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL); + result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); sorted_result = cudf::gather(result->view(), *result_sort_order); col_gold_0 = {{ 3}}; - col_gold_1 = strcol_wrapper({"s1"}, + col_gold_1 = strcol_wrapper({"s1"}, { 1}); col_gold_2 = {{ 0}}; - col_gold_3 = {{ 1}}; + col_gold_3 = {{ 3}}; + col_gold_4 = strcol_wrapper({"s1"}, + { 1}); + col_gold_5 = {{ 1}}; // clang-format on @@ -741,11 +703,13 @@ TEST_F(JoinTest, InnerJoinOnNulls) cols_gold_sql.push_back(col_gold_1.release()); cols_gold_sql.push_back(col_gold_2.release()); cols_gold_sql.push_back(col_gold_3.release()); + cols_gold_sql.push_back(col_gold_4.release()); + cols_gold_sql.push_back(col_gold_5.release()); Table gold_sql(std::move(cols_gold_sql)); gold_sort_order = cudf::sorted_order(gold_sql.view()); sorted_gold = cudf::gather(gold_sql.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } // Empty 
Left Table @@ -766,8 +730,8 @@ TEST_F(JoinTest, EmptyLeftTableInnerJoin) Table empty0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); + auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result); } TEST_F(JoinTest, EmptyLeftTableLeftJoin) @@ -787,36 +751,8 @@ TEST_F(JoinTest, EmptyLeftTableLeftJoin) Table empty0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result); -} - -TEST_F(JoinTest, EmptyLeftTableLeftJoinNonAlignedCommon) -{ - column_wrapper col0_0; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - column_wrapper col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - CVector cols0, cols1; - cols0.emplace_back(col0_0.release()); - cols1.emplace_back(col1_0.release()); - cols1.emplace_back(col1_1.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - column_wrapper col_gold_0; - column_wrapper col_gold_1; - - CVector cols_gold; - cols_gold.emplace_back(col_gold_0.release()); - cols_gold.emplace_back(col_gold_1.release()); - - Table gold(std::move(cols_gold)); - - auto result = cudf::left_join(t0, t1, {0}, {1}, {{0, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); + auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result); } TEST_F(JoinTest, EmptyLeftTableFullJoin) @@ -833,11 +769,29 @@ TEST_F(JoinTest, EmptyLeftTableFullJoin) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table empty0(std::move(cols0)); - Table t1(std::move(cols1)); + Table lhs(std::move(cols0)); + Table rhs(std::move(cols1)); + + auto result = cudf::full_join(lhs, rhs, {0, 1}, {0, 1}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), 
*result_sort_order); + + column_wrapper col_gold_0{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}}; + column_wrapper col_gold_1{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}}; + column_wrapper col_gold_2{{2, 2, 0, 4, 3}}; + column_wrapper col_gold_3{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - auto result = cudf::full_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(t1, *result); + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } // Empty Right Table @@ -858,36 +812,8 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); -} - -TEST_F(JoinTest, EmptyRightTableInnerJoinNonAlignedCommon) -{ - column_wrapper col0_0{{2, 2, 0, 4, 3}}; - column_wrapper col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - - column_wrapper col1_0; - - CVector cols0, cols1; - cols0.emplace_back(col0_0.release()); - cols0.emplace_back(col0_1.release()); - cols1.emplace_back(col1_0.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - column_wrapper col_gold_0; - column_wrapper col_gold_1; - - CVector cols_gold; - cols_gold.emplace_back(col_gold_0.release()); - cols_gold.emplace_back(col_gold_1.release()); - - Table gold(std::move(cols_gold)); - - auto result = cudf::inner_join(t0, t1, {1}, {0}, {{1, 0}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); + auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } TEST_F(JoinTest, EmptyRightTableLeftJoin) @@ 
-907,8 +833,8 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); + auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result); } TEST_F(JoinTest, EmptyRightTableFullJoin) @@ -928,8 +854,8 @@ TEST_F(JoinTest, EmptyRightTableFullJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result); + auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result); } // Both tables empty @@ -950,8 +876,8 @@ TEST_F(JoinTest, BothEmptyInnerJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); + auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } TEST_F(JoinTest, BothEmptyLeftJoin) @@ -971,8 +897,8 @@ TEST_F(JoinTest, BothEmptyLeftJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); + auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } TEST_F(JoinTest, BothEmptyFullJoin) @@ -992,11 +918,11 @@ TEST_F(JoinTest, BothEmptyFullJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result); + auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } -// EqualValues X Inner,Left,Full +// // 
EqualValues X Inner,Left,Full TEST_F(JoinTest, EqualValuesInnerJoin) { @@ -1015,16 +941,22 @@ TEST_F(JoinTest, EqualValuesInnerJoin) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); column_wrapper col_gold_0{{0, 0, 0, 0}}; strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); + column_wrapper col_gold_2{{0, 0, 0, 0}}; + strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}); + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result); } TEST_F(JoinTest, EqualValuesLeftJoin) @@ -1044,16 +976,21 @@ TEST_F(JoinTest, EqualValuesLeftJoin) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); column_wrapper col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}}; strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1}); + column_wrapper col_gold_2{{0, 0, 0, 0}, {1, 1, 1, 1}}; + strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1}); + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); Table gold(std::move(cols_gold)); - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result); } TEST_F(JoinTest, EqualValuesFullJoin) @@ -1073,16 +1010,21 @@ TEST_F(JoinTest, EqualValuesFullJoin) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, 
{0, 1}); column_wrapper col_gold_0{{0, 0, 0, 0}}; strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); + column_wrapper col_gold_2{{0, 0, 0, 0}}; + strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}); + CVector cols_gold; cols_gold.push_back(col_gold_0.release()); cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); Table gold(std::move(cols_gold)); - CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result); } TEST_F(JoinTest, InnerJoinCornerCase) @@ -1097,18 +1039,20 @@ TEST_F(JoinTest, InnerJoinCornerCase) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}); + auto result = cudf::inner_join(t0, t1, {0}, {0}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); column_wrapper col_gold_0{{2, 2, 2, 2}}; + column_wrapper col_gold_1{{2, 2, 2, 2}}; CVector cols_gold; cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, HashJoinSequentialProbes) @@ -1116,129 +1060,106 @@ TEST_F(JoinTest, HashJoinSequentialProbes) CVector cols1; cols1.emplace_back(column_wrapper{{2, 2, 0, 4, 3}}.release()); cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release()); - cols1.emplace_back(column_wrapper{{1, 0, 1, 2, 1}}.release()); Table t1(std::move(cols1)); - cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL); + cudf::hash_join hash_join(t1, cudf::null_equality::EQUAL); { CVector cols0; cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); 
cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); Table t0(std::move(cols0)); - auto result = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); + auto result = hash_join.full_join(t0); + + auto result_table = + cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.first->size()), + result.first->data()}, + cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.second->size()), + result.second->data()}}); + auto result_sort_order = cudf::sorted_order(result_table); + auto sorted_result = cudf::gather(result_table, *result_sort_order); + + column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}}; + column_wrapper col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}}; CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release()); - cols_gold.emplace_back( - strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release()); - cols_gold.emplace_back( - column_wrapper{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}} - .release()); - cols_gold.emplace_back( - column_wrapper{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}} - .release()); - Table gold(std::move(cols_gold)); + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } { CVector cols0; cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 3}}.release()); 
cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); Table t0(std::move(cols0)); - auto result = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}.release()); - cols_gold.emplace_back( - strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}.release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); - } - - { - CVector cols0; - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); - - Table t0(std::move(cols0)); + auto result = hash_join.left_join(t0); + auto result_table = + cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.first->size()), + result.first->data()}, + cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.second->size()), + result.second->data()}}); + auto result_sort_order = cudf::sorted_order(result_table); + auto sorted_result = cudf::gather(result_table, *result_sort_order); - auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}}); - auto joined_cols = probe_build_pair.first->release(); - auto build_cols = probe_build_pair.second->release(); - 
joined_cols.insert(joined_cols.end(), - std::make_move_iterator(build_cols.begin()), - std::make_move_iterator(build_cols.end())); - auto result = std::make_unique(std::move(joined_cols)); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); + column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; + column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - Table gold(std::move(cols_gold)); + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } { CVector cols0; cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); - cols0.emplace_back(column_wrapper{{3, 1, 2, 0, 2}}.release()); cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release()); - cols0.emplace_back(column_wrapper{{0, 1, 2, 4, 1}}.release()); Table t0(std::move(cols0)); - auto probe_build_pair = hash_join.inner_join( - t0, {1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD); - auto joined_cols = probe_build_pair.second->release(); - auto probe_cols = probe_build_pair.first->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(probe_cols.begin()), - std::make_move_iterator(probe_cols.end())); - auto result = std::make_unique(std::move(joined_cols)); - auto result_sort_order = 
cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); + auto result = hash_join.inner_join(t0); + auto result_table = + cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.first->size()), + result.first->data()}, + cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.second->size()), + result.second->data()}}); + auto result_sort_order = cudf::sorted_order(result_table); + auto sorted_result = cudf::gather(result_table, *result_sort_order); + + column_wrapper col_gold_0{{2, 4, 0}}; + column_wrapper col_gold_1{{1, 1, 4}}; CVector cols_gold; - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release()); - cols_gold.emplace_back(column_wrapper{{1, 0, 0}}.release()); - cols_gold.emplace_back(column_wrapper{{3, 2, 2}}.release()); - cols_gold.emplace_back(column_wrapper{{0, 2, 1}}.release()); - Table gold(std::move(cols_gold)); + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + Table gold(std::move(cols_gold)); auto gold_sort_order = cudf::sorted_order(gold.view()); auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } } @@ -1262,7 +1183,7 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); { - auto result = cudf::left_join(t0, t1, {0}, {0}, {}); + auto result = cudf::left_join(t0, t1, {0}, {0}); auto result_view = result->view(); auto decoded1 = cudf::dictionary::decode(result_view.column(1)); auto decoded4 = cudf::dictionary::decode(result_view.column(4)); @@ -1273,18 +1194,8 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) decoded4->view(), result_view.column(5)}); - auto gold = 
cudf::left_join(g0, g1, {0}, {0}, {}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); - } - { - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - auto result_view = result->view(); - auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - std::vector result_decoded( - {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); - - auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); + auto gold = cudf::left_join(g0, g1, {0}, {0}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); } } @@ -1303,17 +1214,21 @@ TEST_F(JoinDictionaryTest, LeftJoinWithNulls) auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded2 = cudf::dictionary::decode(result_view.column(2)); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()}); + auto decoded5 = cudf::dictionary::decode(result_view.column(5)); + std::vector result_decoded({result_view.column(0), + result_view.column(1), + decoded2->view(), + result_view.column(3), + result_view.column(4), + decoded5->view()}); auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); + auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); } TEST_F(JoinDictionaryTest, InnerJoinNoNulls) @@ 
-1331,15 +1246,20 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls) auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - std::vector result_decoded( - {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); + auto decoded4 = cudf::dictionary::decode(result_view.column(4)); + std::vector result_decoded({result_view.column(0), + decoded1->view(), + result_view.column(2), + result_view.column(3), + decoded4->view(), + result_view.column(5)}); auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); } @@ -1358,16 +1278,20 @@ TEST_F(JoinDictionaryTest, InnerJoinWithNulls) auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded2 = cudf::dictionary::decode(result_view.column(2)); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()}); + auto decoded5 = cudf::dictionary::decode(result_view.column(5)); + std::vector result_decoded({result_view.column(0), + result_view.column(1), + decoded2->view(), + result_view.column(3), + result_view.column(4), + decoded5->view()}); auto g0 = cudf::table_view({col0_0, col0_1, 
col0_2_w}); auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); } @@ -1386,16 +1310,21 @@ TEST_F(JoinDictionaryTest, FullJoinNoNulls) auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - std::vector result_decoded( - {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)}); + auto decoded4 = cudf::dictionary::decode(result_view.column(4)); + std::vector result_decoded({result_view.column(0), + decoded1->view(), + result_view.column(2), + result_view.column(3), + decoded4->view(), + result_view.column(5)}); auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); } TEST_F(JoinDictionaryTest, FullJoinWithNulls) @@ -1413,16 +1342,21 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2}); auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2}); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); + auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded0 = cudf::dictionary::decode(result_view.column(0)); - std::vector result_decoded( - {decoded0->view(), 
result_view.column(1), result_view.column(2), result_view.column(3)}); + auto decoded3 = cudf::dictionary::decode(result_view.column(3)); + std::vector result_decoded({decoded0->view(), + result_view.column(1), + result_view.column(2), + decoded3->view(), + result_view.column(4), + result_view.column(5)}); auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}); - CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded)); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/join/semi_join_tests.cpp b/cpp/tests/join/semi_join_tests.cpp index 13c74616484..8de9610b07d 100644 --- a/cpp/tests/join/semi_join_tests.cpp +++ b/cpp/tests/join/semi_join_tests.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -33,809 +34,3 @@ using column_wrapper = cudf::test::fixed_width_column_wrapper; struct JoinTest : public cudf::test::BaseFixture { }; - -TEST_F(JoinTest, LeftSemiJoin) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"quick", "composéd", "result", ""}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; 
})); - - column_wrapper expect_0{10, 20, 20, 20}; - column_wrapper expect_1{5.0, .7, .7, .7}; - column_wrapper expect_2{90, 61, 62, 63}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftSemiJoin_with_a_string_key) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"quick", "result"}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != 
nullptr; })); - - column_wrapper expect_0{10, 20}; - column_wrapper expect_1{5.0, .7}; - column_wrapper expect_2{90, 62}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftSemiJoin_with_null) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{"quick", "result"}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != 
nullptr; })); - - column_wrapper expect_0{10, 20}; - column_wrapper expect_1{5.0, .7}; - column_wrapper expect_2{90, 62}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"accénted", "turtlé", "words"}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - 
column_wrapper expect_0{20, 20, 50}; - column_wrapper expect_1{.5, .5, .7}; - column_wrapper expect_2{77, 78, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_with_a_string_key) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result"}; - std::vector e_strings{"accénted", "turtlé", "composéd", "", "words"}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != 
nullptr; })); - - column_wrapper expect_0{20, 20, 20, 20, 50}; - column_wrapper expect_1{.5, .5, .7, .7, .7}; - column_wrapper expect_2{77, 78, 61, 63, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_with_null) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{"accénted", "turtlé", "composéd", "", "words"}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - 
thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{20, 20, 20, 20, 50}; - column_wrapper expect_1{.5, .5, .7, .7, .7}; - column_wrapper expect_2{77, 78, 61, 63, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftSemiAntiJoin_exceptions) -{ - std::vector b_strings{"quick", "words", "result", nullptr}; - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - // - // table_a has no columns, table_b has 
columns - // Let's check different permutations of passing table - // with no columns to verify that exceptions are thrown - // - EXPECT_THROW(cudf::left_semi_join(table_a, table_b, {}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_a, table_b, {}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_semi_join(table_b, table_a, {}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_b, table_a, {}, {}, {}), cudf::logic_error); - - // - // table_b has columns, so we'll pass the column checks, but - // these should fail the exception check that the number of - // join columns must be the same for each table - // - EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {0}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {0}, {}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {}, {0}, {}), cudf::logic_error); - - EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {}, {0}, {}), cudf::logic_error); -} - -TEST_F(JoinTest, LeftSemiJoin_empty_result) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{}; - column_wrapper expect_1{}; - column_wrapper expect_2{}; - - 
cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {}); - - EXPECT_EQ(join_table->num_columns(), 0); - EXPECT_EQ(join_table->num_rows(), 0); - - auto join_table2 = cudf::left_semi_join(table_a, table_b, {}, {}, {0, 1, 3}); - - EXPECT_EQ(join_table2->num_columns(), 3); - EXPECT_EQ(join_table2->num_rows(), 0); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_empty_result) -{ - std::vector a_strings{ - "quick", "accénted", "turtlé", "composéd", "result", "", "words"}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{}; - - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - 
thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{}; - column_wrapper expect_1{}; - column_wrapper expect_2{}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {}); - - EXPECT_EQ(join_table->num_columns(), 0); - EXPECT_EQ(join_table->num_rows(), 0); - - auto join_table2 = cudf::left_anti_join(table_a, table_b, {}, {}, {0, 1, 3}); - - EXPECT_EQ(join_table2->num_columns(), 3); - EXPECT_EQ(join_table2->num_rows(), 0); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3); -} - -TEST_F(JoinTest, LeftSemiAntiJoin_empty_table) -{ - std::vector a_strings{}; - std::vector b_strings{"quick", "words", "result", nullptr}; - std::vector e_strings{}; - - column_wrapper a_0{}; - column_wrapper a_1{}; - column_wrapper a_2{}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), 
- thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{}; - column_wrapper expect_1{}; - column_wrapper expect_2{}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table->get_column(3), expect_3); - - auto join_table2 = cudf::left_semi_join(table_b, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(3), expect_3); - - auto join_table3 = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table3->get_column(3), expect_3); - - auto 
join_table4 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table4->get_column(3), expect_3); - - auto join_table5 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table5->get_column(3), expect_3); -} - -TEST_F(JoinTest, LeftAntiJoin_empty_right_table) -{ - std::vector a_strings{"quick", "words", "result", nullptr}; - std::vector b_strings{}; - std::vector e_strings{"quick", "words", "result", nullptr}; - - column_wrapper a_0{10, 20, 20, 50}; - column_wrapper a_1{5.0, .7, .7, .7}; - column_wrapper a_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper a_3( - a_strings.begin(), - a_strings.end(), - thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper b_0{}; - column_wrapper b_1{}; - column_wrapper b_2{}; - - cudf::test::strings_column_wrapper b_3( - b_strings.begin(), - b_strings.end(), - thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; })); - - column_wrapper expect_0{10, 20, 20, 50}; - column_wrapper expect_1{5.0, .7, .7, .7}; - column_wrapper expect_2{90, 75, 62, 41}; - - cudf::test::strings_column_wrapper expect_3( - e_strings.begin(), - e_strings.end(), - thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; })); - - std::vector> column_a; - column_a.push_back(a_0.release()); - column_a.push_back(a_1.release()); - column_a.push_back(a_2.release()); - 
column_a.push_back(a_3.release()); - - std::vector> column_b; - column_b.push_back(b_0.release()); - column_b.push_back(b_1.release()); - column_b.push_back(b_2.release()); - column_b.push_back(b_3.release()); - - cudf::table table_a(std::move(column_a)); - cudf::table table_b(std::move(column_b)); - - auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3); -} - -struct JoinDictionaryTest : public cudf::test::BaseFixture { -}; - -TEST_F(JoinDictionaryTest, LeftSemiJoin) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - { - auto result = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); 
- CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected); - } - { - auto result = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); - } -} - -TEST_F(JoinDictionaryTest, LeftSemiJoinWithNulls) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - - auto result = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); -} - 
-TEST_F(JoinDictionaryTest, LeftAntiJoin) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20}; - column_wrapper b_1{5.0, .7, .7}; - column_wrapper b_2{90, 75, 62}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - { - auto result = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected); - } - { - auto result = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); - } -} - -TEST_F(JoinDictionaryTest, LeftAntiJoinWithNulls) -{ - column_wrapper a_0{10, 20, 20, 20, 20, 20, 50}; - column_wrapper a_1{5.0, .5, .5, .7, .7, .7, .7}; - 
column_wrapper a_2{90, 77, 78, 61, 62, 63, 41}; - cudf::test::strings_column_wrapper a_3_w( - {"quick", "accénted", "turtlé", "composéd", "result", "", "words"}); - auto a_3 = cudf::dictionary::encode(a_3_w); - - column_wrapper b_0{10, 20, 20, 50}; - column_wrapper b_1{5.0, .7, .7, .7}; - column_wrapper b_2{90, 75, 62, 41}; - cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0}); - auto b_3 = cudf::dictionary::encode(b_3_w); - - auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()}); - auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()}); - - auto result = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - auto result_view = result->view(); - auto decoded3 = cudf::dictionary::decode(result_view.column(3)); - std::vector result_decoded( - {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()}); - - auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w}); - auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w}); - auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected); -} diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index e5501428624..4c72ba2e055 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -134,11 +134,16 @@ def copy_range(Column input_column, input_begin, input_end, target_begin) -def gather(Table source_table, Column gather_map, bool keep_index=True): +def gather( + Table source_table, + Column gather_map, + bool keep_index=True, + bool nullify=False +): if not pd.api.types.is_integer_dtype(gather_map.dtype): raise ValueError("Gather map is not integer dtype.") - if len(gather_map) > 0: + if len(gather_map) > 0 and not nullify: gm_min, gm_max = minmax(gather_map) if gm_min < -len(source_table) or gm_max >= len(source_table): raise IndexError(f"Gather 
map index with min {gm_min}," @@ -154,7 +159,8 @@ def gather(Table source_table, Column gather_map, bool keep_index=True): source_table_view = source_table.data_view() cdef column_view gather_map_view = gather_map.view() cdef cpp_copying.out_of_bounds_policy policy = ( - cpp_copying.out_of_bounds_policy.DONT_CHECK + cpp_copying.out_of_bounds_policy.NULLIFY if nullify + else cpp_copying.out_of_bounds_policy.DONT_CHECK ) with nogil: diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd index 10edf370f5d..c221fea926d 100644 --- a/python/cudf/cudf/_lib/cpp/join.pxd +++ b/python/cudf/cudf/_lib/cpp/join.pxd @@ -4,44 +4,40 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from libcpp.pair cimport pair from libcpp cimport bool +from libcpp.pair cimport pair +from libcpp.memory cimport unique_ptr +from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type +from rmm._lib.device_uvector cimport device_uvector +ctypedef unique_ptr[device_uvector[size_type]] gather_map_type + cdef extern from "cudf/join.hpp" namespace "cudf" nogil: - cdef unique_ptr[table] inner_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on, - const vector[pair[int, int]] columns_in_common + cdef pair[gather_map_type, gather_map_type] inner_join( + const table_view left_keys, + const table_view right_keys, ) except + - cdef unique_ptr[table] left_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on, - const vector[pair[int, int]] columns_in_common + + cdef pair[gather_map_type, gather_map_type] left_join( + const table_view left_keys, + const table_view right_keys, ) except + - cdef unique_ptr[table] full_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const 
vector[int] right_on, - const vector[pair[int, int]] columns_in_common + + cdef pair[gather_map_type, gather_map_type] full_join( + const table_view left_keys, + const table_view right_keys, ) except + - cdef unique_ptr[table] left_semi_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on, - const vector[int] return_columns + + cdef gather_map_type left_semi_join( + const table_view left_keys, + const table_view right_keys, ) except + - cdef unique_ptr[table] left_anti_join( - const table_view left, - const table_view right, - const vector[int] left_on, - const vector[int] right_on, - const vector[int] return_columns + + cdef gather_map_type left_anti_join( + const table_view left_keys, + const table_view right_keys, ) except + diff --git a/python/cudf/cudf/_lib/cpp/table/table_view.pxd b/python/cudf/cudf/_lib/cpp/table/table_view.pxd index 2f386d337cd..7bbfa69836c 100644 --- a/python/cudf/cudf/_lib/cpp/table/table_view.pxd +++ b/python/cudf/cudf/_lib/cpp/table/table_view.pxd @@ -15,6 +15,7 @@ cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil: column_view column(size_type column_index) except + size_type num_columns() except + size_type num_rows() except + + table_view select(vector[size_type] column_indices) except + cdef cppclass mutable_table_view: mutable_table_view() except + diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 38f13b9f994..69b8004cede 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,222 +1,88 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
+import cudf + from collections import OrderedDict from itertools import chain -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport unique_ptr, make_unique from libcpp.utility cimport move from libcpp.vector cimport vector from libcpp.pair cimport pair from libcpp cimport bool +from cudf._lib.column cimport Column from cudf._lib.table cimport Table, columns_from_ptr +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport size_type, data_type, type_id from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cimport cudf._lib.cpp.join as cpp_join -cpdef join(Table lhs, - Table rhs, - object how, - object method, - object left_on=None, - object right_on=None, - bool left_index=False, - bool right_index=False - ): - """ - Call libcudf++ join for full outer, inner and left joins. - """ - - cdef Table c_lhs = lhs - cdef Table c_rhs = rhs - - # Views might or might not include index - cdef table_view lhs_view - cdef table_view rhs_view - - # Will hold the join column indices into L and R tables - cdef vector[int] left_on_ind - cdef vector[int] right_on_ind - - # If left/right index, will pass a full view - # must offset the data column indices by # of index columns - num_inds_left = len(left_on) + (lhs._num_indices * left_index) - num_inds_right = len(right_on) + (rhs._num_indices * right_index) - left_on_ind.reserve(num_inds_left) - right_on_ind.reserve(num_inds_right) - - # Only used for semi or anti joins - # The result columns are only the left hand columns - cdef vector[int] all_left_inds = range( - lhs._num_columns + (lhs._num_indices * left_index) - ) - cdef vector[int] all_right_inds = range( - rhs._num_columns + (rhs._num_indices * right_index) - ) - result_col_names = compute_result_col_names(lhs, rhs, how) - - columns_in_common = OrderedDict() - cdef vector[pair[int, int]] c_columns_in_common - - # keep track of where the desired index column will end up - result_index_pos = 
None - if left_index or right_index: - # If either true, we need to process both indices as columns - lhs_view = c_lhs.view() - rhs_view = c_rhs.view() - - left_join_cols = list(lhs._index_names) + list(lhs._data.keys()) - right_join_cols = list(rhs._index_names) + list(rhs._data.keys()) - if left_index and right_index: - # Index columns will be common, on the left, dropped from right - # Index name is from the left - # Both views, must take index column indices - left_on_indices = right_on_indices = range(lhs._num_indices) - result_idx_positions = range(lhs._num_indices) - result_index_names = lhs._index_names - - elif left_index: - # Joins left index columns with right 'on' columns - left_on_indices = range(lhs._num_indices) - right_on_indices = [ - right_join_cols.index(on_col) for on_col in right_on - ] - - # The left index columns 'become' the new RHS columns - # and the right index 'survives' - result_idx_positions = range( - len(left_join_cols), len(left_join_cols) + lhs._num_indices - ) - result_index_names = rhs._index_names - - # but since the common columns are gathered from the left - # the rhs 'on' cols are returned on the left of the result - # rearrange the names so account for this - common = [None] * rhs._num_indices - for i in range(rhs._num_indices): - common[i] = result_col_names.pop( - result_col_names.index(right_on[i]) - ) - result_col_names = common + result_col_names - elif right_index: - # Joins right index columns with left 'on' columns - right_on_indices = range(rhs._num_indices) - left_on_indices = [ - left_join_cols.index(on_col) for on_col in left_on - ] - - # The right index columns 'become' the new LHS columns - # and the left index survives - # since they are already gathered from the left, - # no rearranging has to be done - result_idx_positions = range(lhs._num_indices) - result_index_names = lhs._index_names - for i_l, i_r in zip(left_on_indices, right_on_indices): - left_on_ind.push_back(i_l) - right_on_ind.push_back(i_r) - 
columns_in_common[(i_l, i_r)] = None - else: - # cuDF's Python layer will create a new RangeIndex for this case - lhs_view = c_lhs.data_view() - rhs_view = c_rhs.data_view() - - left_join_cols = list(lhs._data.keys()) - right_join_cols = list(rhs._data.keys()) - - # If both left/right_index, joining on indices plus additional cols - # If neither, joining on just cols, not indices - # In both cases, must match up additional column indices in lhs/rhs - if left_index == right_index: - for name in left_on: - left_on_ind.push_back(left_join_cols.index(name)) - if name in right_on: - if (left_on.index(name) == right_on.index(name)): - columns_in_common[( - left_join_cols.index(name), - right_join_cols.index(name) - )] = None - for name in right_on: - right_on_ind.push_back(right_join_cols.index(name)) - c_columns_in_common = list(columns_in_common.keys()) - cdef unique_ptr[table] c_result - if how == 'inner': - with nogil: - c_result = move(cpp_join.inner_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - c_columns_in_common - )) - elif how == 'left': - with nogil: - c_result = move(cpp_join.left_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - c_columns_in_common - )) - elif how == 'outer': - with nogil: - c_result = move(cpp_join.full_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - c_columns_in_common - )) - elif how == 'leftsemi': - with nogil: - c_result = move(cpp_join.left_semi_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - all_left_inds - )) - elif how == 'leftanti': - with nogil: - c_result = move(cpp_join.left_anti_join( - lhs_view, - rhs_view, - left_on_ind, - right_on_ind, - all_left_inds - )) - - all_cols_py = columns_from_ptr(move(c_result)) - if left_index or right_index: - ind_cols = OrderedDict() - for name, pos in zip( - result_index_names[::-1], result_idx_positions[::-1] - ): - ind_cols[name] = all_cols_py.pop(pos) - index = OrderedDict() - for k, v in reversed(ind_cols.items()): - index[k] = v - 
index = Table(index) +# The functions below return the *gathermaps* that represent +# the join result when joining on the keys `lhs` and `rhs`. + +cpdef join(Table lhs, Table rhs, how=None): + cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result + cdef table_view c_lhs = lhs.view() + cdef table_view c_rhs = rhs.view() + + if how == "inner": + c_result = move(cpp_join.inner_join( + c_lhs, + c_rhs + )) + elif how == "left": + c_result = move(cpp_join.left_join( + c_lhs, + c_rhs + )) + elif how == "outer": + c_result = move(cpp_join.full_join( + c_lhs, + c_rhs + )) else: - index = None - data_ordered_dict = OrderedDict(zip(result_col_names, all_cols_py)) - return Table(data=data_ordered_dict, index=index) - - -def compute_result_col_names(lhs, rhs, how): - """ - Determine the names of the data columns in the result of - a libcudf join, based on the original left and right frames - as well as the type of join that was performed. - """ - if how in {"left", "inner", "outer", "leftsemi", "leftanti"}: - a = lhs._data.keys() - if how not in {"leftsemi", "leftanti"}: - return list(chain(a, (k for k in rhs._data.keys() - if k not in lhs._data.keys()))) - return list(a) + raise ValueError(f"Invalid join type {how}") + + cdef Column left_rows = _gather_map_as_column(move(c_result.first)) + cdef Column right_rows = _gather_map_as_column(move(c_result.second)) + return left_rows, right_rows + + +cpdef semi_join(Table lhs, Table rhs, how=None): + # left-semi and left-anti joins + cdef cpp_join.gather_map_type c_result + cdef table_view c_lhs = lhs.view() + cdef table_view c_rhs = rhs.view() + + if how == "leftsemi": + c_result = move(cpp_join.left_semi_join( + c_lhs, + c_rhs + )) + elif how == "leftanti": + c_result = move(cpp_join.left_anti_join( + c_lhs, + c_rhs + )) else: - raise NotImplementedError( - f"{how} merge not supported yet" - ) + raise ValueError(f"Invalid join type {how}") + + cdef Column left_rows = _gather_map_as_column(move(c_result)) + return ( 
+ left_rows, + None + ) + + +cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): + # helper to convert a gather map to a Column + cdef size_type size = gather_map.get()[0].size() + cdef unique_ptr[column] c_col = make_unique[column]( + data_type(type_id.INT32), + size, + gather_map.get()[0].release()) + return Column.from_unique_ptr(move(c_col)) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 39c278d2abf..bb1bf3c5d5c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -750,6 +750,9 @@ def _set_categories( ordered=ordered, ) + def _decategorize(self) -> ColumnBase: + return self._column._get_decategorized_column() + class CategoricalColumn(column.ColumnBase): """Implements operations for Columns of Categorical type diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dd06d97d105..e59b395ec0f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -827,7 +827,12 @@ def quantile( def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") - def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T: + def take( + self: T, + indices: ColumnBase, + keep_index: bool = True, + nullify: bool = False, + ) -> T: """Return Column by taking values from the corresponding *indices*. 
""" # Handle zero size @@ -836,7 +841,7 @@ def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T: try: return ( self.as_frame() - ._gather(indices, keep_index=keep_index) + ._gather(indices, keep_index=keep_index, nullify=nullify) ._as_column() ) except RuntimeError as e: @@ -1004,7 +1009,9 @@ def sort_by_values( ascending: bool = True, na_position: builtins.str = "last", ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]: - col_inds = self.as_frame()._get_sorted_inds(ascending, na_position) + col_inds = self.as_frame()._get_sorted_inds( + ascending=ascending, na_position=na_position + ) col_keys = self.take(col_inds) return col_keys, col_inds @@ -1016,6 +1023,9 @@ def distinct_count( raise NotImplementedError(msg) return cpp_distinct_count(self, ignore_nulls=dropna) + def can_cast_safely(self, to_dtype: Dtype) -> bool: + raise NotImplementedError() + def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_numerical_dtype(dtype): return self.as_numerical_column(dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7ad6eed65a8..da77517c75d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -362,7 +362,9 @@ def _numeric_quantile( ) -> NumericalColumn: quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls - sorted_indices = self.as_frame()._get_sorted_inds(True, "first") + sorted_indices = self.as_frame()._get_sorted_inds( + ascending=True, na_position="first" + ) sorted_indices = sorted_indices[self.null_count :] return cpp_quantile(self, quant, interpolation, sorted_indices, exact) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b5f57356698..01b96151485 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4497,12 +4497,9 @@ def merge( else: lsuffix, rsuffix = suffixes - lhs = 
self.copy(deep=False) - rhs = right.copy(deep=False) - # Compute merge - gdf_result = super(DataFrame, lhs)._merge( - rhs, + gdf_result = super()._merge( + right, on=on, left_on=left_on, right_on=right_on, @@ -4510,8 +4507,6 @@ def merge( right_index=right_index, how=how, sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, method=method, indicator=indicator, suffixes=suffixes, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ecff3dee573..fb746d6c794 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -20,6 +20,7 @@ from cudf import _lib as libcudf from cudf._typing import ColumnLike, DataFrameOrSeries from cudf.core.column import as_column, build_categorical_column, column_empty +from cudf.core.join import merge from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, @@ -595,7 +596,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res - def _get_columns_by_label(self, labels, downcast): + def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -612,15 +613,18 @@ def _get_columns_by_index(self, indices): data, columns=data.to_pandas_index(), index=self.index ) - def _gather(self, gather_map, keep_index=True): + def _gather(self, gather_map, keep_index=True, nullify=False): if not pd.api.types.is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") result = self.__class__._from_table( libcudf.copying.gather( - self, as_column(gather_map), keep_index=keep_index + self, + as_column(gather_map), + keep_index=keep_index, + nullify=nullify, ) ) - result._copy_type_metadata(self) + result._copy_type_metadata(self, include_index=keep_index) if keep_index and self._index is not None: result._index.names = self._index.names return result @@ -2754,12 +2758,15 @@ def searchsorted( else: return result - def _get_sorted_inds(self, ascending=True, na_position="last"): + 
def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): """ Sort by the values. Parameters ---------- + by: list, optional + Labels specifying columns to sort by. By default, + sort by all columns of `self` ascending : bool or list of bool, default True If True, sort values in ascending order, otherwise descending. na_position : {‘first’ or ‘last’}, default ‘last’ @@ -2794,11 +2801,17 @@ def _get_sorted_inds(self, ascending=True, na_position="last"): ) na_position = 0 + to_sort = ( + self + if by is None + else self._get_columns_by_label(by, downcast=False) + ) + # If given a scalar need to construct a sequence of length # of columns if np.isscalar(ascending): - ascending = [ascending] * self._num_columns + ascending = [ascending] * to_sort._num_columns - return libcudf.sort.order_by(self, ascending, na_position) + return libcudf.sort.order_by(to_sort, ascending, na_position) def sin(self): """ @@ -3329,77 +3342,6 @@ def sqrt(self): """ return self._unaryop("sqrt") - @staticmethod - def _validate_merge_cfg( - lhs, - rhs, - left_on, - right_on, - on, - how, - left_index=False, - right_index=False, - lsuffix=None, - rsuffix=None, - ): - """ - Error for various combinations of merge input parameters - """ - len_left_on = len(left_on) if left_on is not None else 0 - len_right_on = len(right_on) if right_on is not None else 0 - - # must actually support the requested merge type - if how not in ["left", "inner", "outer", "leftanti", "leftsemi"]: - raise NotImplementedError(f"{how} merge not supported yet") - - # Passing 'on' with 'left_on' or 'right_on' is potentially ambiguous - if on: - if left_on or right_on: - raise ValueError( - 'Can only pass argument "on" OR "left_on" ' - 'and "right_on", not a combination of both.' 
- ) - - # Require same total number of columns to join on in both operands - if not (len_left_on + left_index * len(lhs.index.names)) == ( - len_right_on + right_index * len(rhs.index.names) - ): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - - # If nothing specified, must have common cols to use implicitly - same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys()) - if not (left_index or right_index): - if not (left_on or right_on): - if len(same_named_columns) == 0: - raise ValueError("No common columns to perform merge on") - - for name in same_named_columns: - if not ( - name in left_on - and name in right_on - and (left_on.index(name) == right_on.index(name)) - ): - if not (lsuffix or rsuffix): - raise ValueError( - "there are overlapping columns but " - "lsuffix and rsuffix are not defined" - ) - - if on: - on_keys = [on] if not isinstance(on, list) else on - for key in on_keys: - if not (key in lhs._data.keys() and key in rhs._data.keys()): - raise KeyError(f"Key {on} not in both operands") - else: - for key in left_on: - if key not in lhs._data.keys(): - raise KeyError(f'Key "{key}" not in left operand') - for key in right_on: - if key not in rhs._data.keys(): - raise KeyError(f'Key "{key}" not in right operand') - def _merge( self, right, @@ -3410,84 +3352,33 @@ def _merge( right_index=False, how="inner", sort=False, - lsuffix=None, - rsuffix=None, method="hash", indicator=False, suffixes=("_x", "_y"), ): - # Merge doesn't support right, so just swap + lhs, rhs = self, right if how == "right": - return right._merge( - self, - on=on, - left_on=right_on, - right_on=left_on, - left_index=right_index, - right_index=left_index, - how="left", - sort=sort, - lsuffix=rsuffix, - rsuffix=lsuffix, - method=method, - indicator=indicator, - suffixes=suffixes, - ) - - lhs = self - rhs = right - - from cudf.core.join import Merge - - mergeop = Merge( + # Merge doesn't support right, so just swap + how = "left" + lhs, rhs 
= right, self + left_on, right_on = right_on, left_on + left_index, right_index = right_index, left_index + suffixes = (suffixes[1], suffixes[0]) + + return merge( lhs, rhs, - on, - left_on, - right_on, - left_index, - right_index, - how, - sort, - lsuffix, - rsuffix, - method, - indicator, - suffixes, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + sort=sort, + method=method, + indicator=indicator, + suffixes=suffixes, ) - to_return = mergeop.perform_merge() - - # If sort=True, Pandas would sort on the key columns in the - # same order as given in 'on'. If the indices are used as - # keys, the index will be sorted. If one index is specified, - # the key column on the other side will be used to sort. - # If no index is specified, return a new RangeIndex - if sort: - to_sort = cudf.DataFrame() - if left_index and right_index: - by = list(to_return._index._data.columns) - if left_on and right_on: - by.extend(to_return[mergeop.left_on]._data.columns) - elif left_index: - by = list(to_return[mergeop.right_on]._data.columns) - elif right_index: - by = list(to_return[mergeop.left_on]._data.columns) - else: - # left_on == right_on, or different names but same columns - # in both cases we can sort by either - by = [to_return._data[name] for name in mergeop.left_on] - for i, col in enumerate(by): - to_sort[i] = col - inds = to_sort.argsort() - if isinstance(to_return, cudf.Index): - to_return = to_return.take(inds) - else: - to_return = to_return.take( - inds, keep_index=(left_index or right_index) - ) - return to_return - else: - return to_return def _is_sorted(self, ascending=None, null_position=None): """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2a5d2647e95..5104629eee0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -13,6 +13,7 @@ from pandas._config import get_option import cudf +from cudf._typing import DtypeObj from 
cudf.core.abc import Serializable from cudf.core.column import ( CategoricalColumn, @@ -66,6 +67,9 @@ def _to_frame(this_index, index=True, name=None): class Index(Frame, Serializable): + + dtype: DtypeObj + def __new__( cls, data=None, @@ -1544,6 +1548,10 @@ def _from_table(cls, table): else: return as_index(table) + @classmethod + def _from_data(cls, data, index=None): + return cls._from_table(Frame(data=data)) + _accessors = set() # type: Set[Any] @property diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py index 6d126c8af4d..0463b8f9df1 100644 --- a/python/cudf/cudf/core/join/__init__.py +++ b/python/cudf/cudf/core/join/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2020, NVIDIA CORPORATION. -from cudf.core.join.join import Merge +from cudf.core.join.join import merge diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py new file mode 100644 index 00000000000..3807f408369 --- /dev/null +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +from __future__ import annotations + +import collections +import warnings +from typing import TYPE_CHECKING, Any, Iterable, Tuple + +import numpy as np +import pandas as pd + +import cudf +from cudf.core.dtypes import CategoricalDtype + +if TYPE_CHECKING: + from cudf.core.column import CategoricalColumn, ColumnBase + from cudf.core.frame import Frame + + +class _Indexer: + # Indexer into a column (either a data column or index level). 
+ # + # >>> df + # a + # b + # 4 1 + # 5 2 + # 6 3 + # >>> _Indexer("a", column=True).get(df) # returns column "a" of df + # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df + + def __init__(self, name: Any, column=False, index=False): + if column and index: + raise ValueError("Cannot specify both column and index") + self.name = name + self.column, self.index = column, index + + def get(self, obj: Frame) -> ColumnBase: + # get the column from `obj` + if self.column: + return obj._data[self.name] + else: + if obj._index is not None: + return obj._index._data[self.name] + raise KeyError() + + def set(self, obj: Frame, value: ColumnBase, validate=False): + # set the colum in `obj` + if self.column: + obj._data.set_by_label(self.name, value, validate=validate) + else: + if obj._index is not None: + obj._index._data.set_by_label( + self.name, value, validate=validate + ) + else: + raise KeyError() + + +def _frame_select_by_indexers( + frame: Frame, indexers: Iterable[_Indexer] +) -> Frame: + # Select columns from the given `Frame` using `indexers`, + # and return a new `Frame`. + index_data = frame._data.__class__() + data = frame._data.__class__() + + for idx in indexers: + if idx.index: + index_data.set_by_label(idx.name, idx.get(frame), validate=False) + else: + data.set_by_label(idx.name, idx.get(frame), validate=False) + + result_index = cudf.Index._from_data(index_data) if index_data else None + result = cudf.core.frame.Frame(data=data, index=result_index) + return result + + +def _match_join_keys( + lcol: ColumnBase, rcol: ColumnBase, how: str +) -> Tuple[ColumnBase, ColumnBase]: + # returns the common dtype that lcol and rcol should be casted to, + # before they can be used as left and right join keys. 
+ # If no casting is necessary, returns None + + common_type = None + + # cast the keys lcol and rcol to a common dtype + ltype = lcol.dtype + rtype = rcol.dtype + + # if either side is categorical, different logic + if isinstance(ltype, CategoricalDtype) or isinstance( + rtype, CategoricalDtype + ): + return _match_categorical_dtypes(lcol, rcol, how) + + if pd.api.types.is_dtype_equal(ltype, rtype): + return lcol, rcol + + if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)): + common_type = ( + max(ltype, rtype) + if ltype.kind == rtype.kind + else np.find_common_type([], (ltype, rtype)) + ) + + elif np.issubdtype(ltype, np.datetime64) and np.issubdtype( + rtype, np.datetime64 + ): + common_type = max(ltype, rtype) + + if how == "left": + if rcol.fillna(0).can_cast_safely(ltype): + return lcol, rcol.astype(ltype) + else: + warnings.warn( + f"Can't safely cast column from {rtype} to {ltype}, " + "upcasting to {common_type}." + ) + + return lcol.astype(common_type), rcol.astype(common_type) + + +def _match_categorical_dtypes( + lcol: ColumnBase, rcol: ColumnBase, how: str +) -> Tuple[ColumnBase, ColumnBase]: + # cast the keys lcol and rcol to a common dtype + # when at least one of them is a categorical type + ltype, rtype = lcol.dtype, rcol.dtype + + if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance( + rcol, cudf.core.column.CategoricalColumn + ): + # if both are categoricals, logic is complicated: + return _match_categorical_dtypes_both(lcol, rcol, how) + + if isinstance(ltype, CategoricalDtype): + if how in {"left", "leftsemi", "leftanti"}: + return lcol, rcol.astype(ltype) + common_type = ltype.categories.dtype + elif isinstance(rtype, CategoricalDtype): + common_type = rtype.categories.dtype + return lcol.astype(common_type), rcol.astype(common_type) + + +def _match_categorical_dtypes_both( + lcol: CategoricalColumn, rcol: CategoricalColumn, how: str +) -> Tuple[ColumnBase, ColumnBase]: + # The commontype depends on 
both `how` and the specifics of the + # categorical variables to be merged. + + ltype, rtype = lcol.dtype, rcol.dtype + + # when both are ordered and both have the same categories, + # no casting required: + if ltype == rtype: + return lcol, rcol + + # Merging categorical variables when only one side is ordered is + # ambiguous and not allowed. + if ltype.ordered != rtype.ordered: + raise TypeError( + "Merging on categorical variables with mismatched" + " ordering is ambiguous" + ) + + if ltype.ordered and rtype.ordered: + # if we get to here, categories must be what causes the + # dtype equality check to fail. And we can never merge + # two ordered categoricals with different categories + raise TypeError( + f"{how} merge between categoricals with " + "different categories is only valid when " + "neither side is ordered" + ) + + # the following should now always hold + assert not ltype.ordered and not rtype.ordered + + if how == "inner": + # cast to category types -- we must cast them back later + return _match_join_keys( + lcol.cat()._decategorize(), rcol.cat()._decategorize(), how, + ) + elif how in {"left", "leftanti", "leftsemi"}: + # always cast to left type + return lcol, rcol.astype(ltype) + else: + # merge categories + merged_categories = cudf.concat( + [ltype.categories, rtype.categories] + ).unique() + common_type = cudf.CategoricalDtype( + categories=merged_categories, ordered=False + ) + return lcol.astype(common_type), rcol.astype(common_type) + + +def _coerce_to_tuple(obj): + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return tuple(obj) + else: + return (obj,) diff --git a/python/cudf/cudf/core/join/casting_logic.py b/python/cudf/cudf/core/join/casting_logic.py deleted file mode 100644 index eb85cecd14d..00000000000 --- a/python/cudf/cudf/core/join/casting_logic.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -import warnings - -import numpy as np -import pandas as pd - -import cudf -from cudf.core.dtypes import CategoricalDtype - - -def _input_to_libcudf_castrules_both_cat(lcol, rcol, how): - """ - Based off the left and right operands, determine the libcudf - merge dtype or error for corner cases where the merge cannot - proceed. This function handles categorical variables. - Categorical variable typecasting logic depends on both `how` - and the specifics of the categorical variables to be merged. - Merging categorical variables when only one side is ordered - is ambiguous and not allowed. Merging when both categoricals - are ordered is allowed, but only when the categories are - exactly equal and have equal ordering, and will result in the - common dtype. - When both sides are unordered, the result categorical depends - on the kind of join: - - For inner joins, the result will be the intersection of the - categories - - For left or right joins, the result will be the the left or - right dtype respectively. This extends to semi and anti joins. - - For outer joins, the result will be the union of categories - from both sides. - - """ - ltype = lcol.dtype - rtype = rcol.dtype - - # this function is only to be used to resolve the result when both - # sides are categorical - if not isinstance(ltype, CategoricalDtype) and isinstance( - rtype, CategoricalDtype - ): - raise TypeError("Both operands must be CategoricalDtype") - - # true for every configuration - if ltype == rtype: - return ltype - - # raise for any join where ordering doesn't match - if ltype.ordered != rtype.ordered: - raise TypeError( - "Merging on categorical variables with mismatched" - " ordering is ambiguous" - ) - elif ltype.ordered and rtype.ordered: - # if we get to here, categories must be what causes the - # dtype equality check to fail. 
And we can never merge - # two ordered categoricals with different categories - raise TypeError( - f"{how} merge between categoricals with " - "different categories is only valid when " - "neither side is ordered" - ) - - elif how == "inner": - # neither ordered, so categories must be different - # demote to underlying types - return _input_to_libcudf_castrules_any( - ltype.categories, rtype.categories, how - ) - - elif how == "left": - return ltype - elif how == "right": - return rtype - - elif how == "outer": - new_cats = cudf.concat([ltype.categories, rtype.categories]).unique() - return cudf.CategoricalDtype(categories=new_cats, ordered=False) - - -def _input_to_libcudf_castrules_any_cat(lcol, rcol, how): - - l_is_cat = isinstance(lcol.dtype, CategoricalDtype) - r_is_cat = isinstance(rcol.dtype, CategoricalDtype) - - if l_is_cat and r_is_cat: - return _input_to_libcudf_castrules_both_cat(lcol, rcol, how) - elif l_is_cat or r_is_cat: - if l_is_cat and how == "left": - return lcol.dtype - if r_is_cat and how == "right": - return rcol.dtype - return ( - lcol.dtype.categories.dtype - if l_is_cat - else rcol.dtype.categories.dtype - ) - else: - raise ValueError("Neither operand is categorical") - - -def _input_to_libcudf_castrules_any(lcol, rcol, how): - """ - Determine what dtype the left and right hand - input columns must be cast to for a libcudf - join to proceed. 
- """ - - cast_warn = ( - "can't safely cast column from {} with type" - " {} to {}, upcasting to {}" - ) - - ltype = lcol.dtype - rtype = rcol.dtype - - # if either side is categorical, different logic - if isinstance(ltype, CategoricalDtype) or isinstance( - rtype, CategoricalDtype - ): - return _input_to_libcudf_castrules_any_cat(lcol, rcol, how) - - libcudf_join_type = None - if pd.api.types.is_dtype_equal(ltype, rtype): - libcudf_join_type = ltype - elif how == "left": - check_col = rcol.fillna(0) - if not check_col.can_cast_safely(ltype): - libcudf_join_type = _input_to_libcudf_castrules_any( - lcol, rcol, "inner" - ) - warnings.warn( - cast_warn.format("right", rtype, ltype, libcudf_join_type) - ) - else: - libcudf_join_type = ltype - elif how == "right": - check_col = lcol.fillna(0) - if not check_col.can_cast_safely(rtype): - libcudf_join_type = _input_to_libcudf_castrules_any( - lcol, rcol, "inner" - ) - warnings.warn( - cast_warn.format("left", ltype, rtype, libcudf_join_type) - ) - else: - libcudf_join_type = rtype - elif how in {"inner", "outer"}: - if (np.issubdtype(ltype, np.number)) and ( - np.issubdtype(rtype, np.number) - ): - if ltype.kind == rtype.kind: - # both ints or both floats - libcudf_join_type = max(ltype, rtype) - else: - libcudf_join_type = np.find_common_type([], [ltype, rtype]) - elif np.issubdtype(ltype, np.datetime64) and np.issubdtype( - rtype, np.datetime64 - ): - libcudf_join_type = max(ltype, rtype) - return libcudf_join_type - - -def _libcudf_to_output_castrules(lcol, rcol, how): - """ - Determine what dtype an output merge key column should be - cast to after it has been processed by libcudf. Determine - if a column should be promoted to a categorical datatype. - For inner merges between unordered categoricals, we get a - new categorical variable containing the intersection of - the two source variables. 
For left or right joins, we get - the original categorical variable from whichever was the - major operand of the join, e.g. left for a left join or - right for a right join. In the case of an outer join, the - result will be a new categorical variable with both sets - of categories. - """ - merge_return_type = None - - ltype = lcol.dtype - rtype = rcol.dtype - - if pd.api.types.is_dtype_equal(ltype, rtype): - return ltype - - l_is_cat = isinstance(ltype, CategoricalDtype) - r_is_cat = isinstance(rtype, CategoricalDtype) - - # we currently only need to do this for categorical variables - if how == "inner": - if l_is_cat and r_is_cat: - merge_return_type = "category" - elif how == "left": - if l_is_cat: - merge_return_type = ltype - elif how == "right": - if r_is_cat: - merge_return_type = rtype - elif how == "outer": - if l_is_cat and r_is_cat: - new_cats = cudf.concat( - [ltype.categories, rtype.categories] - ).unique() - merge_return_type = cudf.CategoricalDtype( - categories=new_cats, ordered=ltype.ordered - ) - return merge_return_type diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index c6da3ee8dc4..1a4826d0570 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,22 +1,85 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
-import itertools +from __future__ import annotations -import pandas as pd +import functools +from collections import namedtuple +from typing import TYPE_CHECKING, Callable, Tuple import cudf from cudf import _lib as libcudf -from cudf._lib.join import compute_result_col_names -from cudf.core.join.casting_logic import ( - _input_to_libcudf_castrules_any, - _libcudf_to_output_castrules, +from cudf.core.join._join_helpers import ( + _coerce_to_tuple, + _frame_select_by_indexers, + _Indexer, + _match_join_keys, ) +if TYPE_CHECKING: + from cudf.core.frame import Frame + + +def merge( + lhs, + rhs, + *, + on, + left_on, + right_on, + left_index, + right_index, + how, + sort, + method, + indicator, + suffixes, +): + if how in {"leftsemi", "leftanti"}: + merge_cls = MergeSemi + else: + merge_cls = Merge + mergeobj = merge_cls( + lhs, + rhs, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + sort=sort, + method=method, + indicator=indicator, + suffixes=suffixes, + ) + return mergeobj.perform_merge() + + +_JoinKeys = namedtuple("JoinKeys", ["left", "right"]) + class Merge(object): + # A namedtuple of indexers representing the left and right keys + _keys: _JoinKeys + + # The joiner function must have the following signature: + # + # def joiner( + # lhs: Frame, + # rhs: Frame + # ) -> Tuple[Optional[Column], Optional[Column]]: + # ... + # + # where `lhs` and `rhs` are Frames composed of the left and right + # join key. The `joiner` returns a tuple of two Columns + # representing the rows to gather from the left- and right- side + # tables respectively. + _joiner: Callable + def __init__( self, lhs, rhs, + *, on, left_on, right_on, @@ -24,8 +87,6 @@ def __init__( right_index, how, sort, - lsuffix, - rsuffix, method, indicator, suffixes, @@ -60,140 +121,252 @@ def __init__( sort : bool Boolean flag indicating if the output Frame is to be sorted on the output's join keys, in left to right order. 
- lsuffix : string - The suffix to be appended to left hand column names that - are found to exist in the right frame, but are not specified - as join keys themselves. - rsuffix : string - The suffix to be appended to right hand column names that - are found to exist in the left frame, but are not specified - as join keys themselves. suffixes : list like Left and right suffixes specified together, unpacked into lsuffix and rsuffix. """ + self._validate_merge_params( + lhs, + rhs, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + suffixes=suffixes, + ) + self._joiner = functools.partial(libcudf.join.join, how=how) + self.lhs = lhs self.rhs = rhs + self.on = on + self.left_on = left_on + self.right_on = right_on self.left_index = left_index self.right_index = right_index - self.method = method - self.sort = sort - - # check that the merge is valid - - self.validate_merge_cfg( - lhs, - rhs, - on, - left_on, - right_on, - left_index, - right_index, - how, - lsuffix, - rsuffix, - suffixes, - ) self.how = how - self.preprocess_merge_params( - on, left_on, right_on, lsuffix, rsuffix, suffixes - ) - - def perform_merge(self): - """ - Call libcudf to perform a merge between the operands. If - necessary, cast the input key columns to compatible types. - Potentially also cast the output back to categorical. 
- """ - output_dtypes = self.compute_output_dtypes() - self.typecast_input_to_libcudf() - libcudf_result = libcudf.join.join( - self.lhs, - self.rhs, - self.how, - self.method, - left_on=self.left_on, - right_on=self.right_on, - left_index=self.left_index, - right_index=self.right_index, - ) - result = self.out_class._from_table(libcudf_result) - result = self.typecast_libcudf_to_output(result, output_dtypes) - if isinstance(result, cudf.Index): - return result - else: - return result[ - compute_result_col_names(self.lhs, self.rhs, self.how) - ] + self.sort = sort + if suffixes: + self.lsuffix, self.rsuffix = suffixes + self._compute_join_keys() - def preprocess_merge_params( - self, on, left_on, right_on, lsuffix, rsuffix, suffixes - ): - """ - Translate a valid configuration of user input parameters into - the subset of input configurations handled by the cython layer. - Apply suffixes to columns. - """ + @property + def _out_class(self): + # type of the result + out_class = cudf.DataFrame - self.out_class = cudf.DataFrame if isinstance(self.lhs, cudf.MultiIndex) or isinstance( self.rhs, cudf.MultiIndex ): - self.out_class = cudf.MultiIndex + out_class = cudf.MultiIndex elif isinstance(self.lhs, cudf.Index): - self.out_class = self.lhs.__class__ + out_class = self.lhs.__class__ + return out_class - if on: - on = [on] if isinstance(on, str) else list(on) - left_on = right_on = on - else: - if left_on: - left_on = ( - [left_on] if isinstance(left_on, str) else list(left_on) - ) - if right_on: - right_on = ( - [right_on] if isinstance(right_on, str) else list(right_on) - ) + def perform_merge(self) -> Frame: + lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs) - same_named_columns = set(self.lhs._data.keys()) & set( - self.rhs._data.keys() + left_table = _frame_select_by_indexers(lhs, self._keys.left) + right_table = _frame_select_by_indexers(rhs, self._keys.right) + + left_rows, right_rows = self._joiner( + left_table, right_table, how=self.how, ) - if not 
(left_on or right_on) and not ( - self.left_index and self.right_index - ): - left_on = right_on = list(same_named_columns) - - no_suffix_cols = [] - if left_on and right_on: - no_suffix_cols = [ - left_name - for left_name, right_name in zip(left_on, right_on) - if left_name == right_name and left_name in same_named_columns - ] + lhs, rhs = self._restore_categorical_keys(lhs, rhs) - if suffixes: - lsuffix, rsuffix = suffixes - for name in same_named_columns: - if name not in no_suffix_cols: - self.lhs.rename( - {name: f"{name}{lsuffix}"}, inplace=True, axis=1 + left_result = cudf.core.frame.Frame() + right_result = cudf.core.frame.Frame() + + gather_index = self.left_index or self.right_index + if left_rows is not None: + left_result = lhs._gather( + left_rows, nullify=True, keep_index=gather_index + ) + if right_rows is not None: + right_result = rhs._gather( + right_rows, nullify=True, keep_index=gather_index + ) + + result = self._merge_results(left_result, right_result) + + if self.sort: + result = self._sort_result(result) + return result + + def _compute_join_keys(self): + # Computes self._keys + if ( + self.left_index + or self.right_index + or self.left_on + or self.right_on + ): + left_keys = [] + right_keys = [] + if self.left_index: + left_keys.extend( + [ + _Indexer(name=on, index=True) + for on in self.lhs.index.names + ] ) - self.rhs.rename( - {name: f"{name}{rsuffix}"}, inplace=True, axis=1 + if self.left_on: + # TODO: require left_on or left_index to be specified + left_keys.extend( + [ + _Indexer(name=on, column=True) + for on in _coerce_to_tuple(self.left_on) + ] ) - if left_on and name in left_on: - left_on[left_on.index(name)] = f"{name}{lsuffix}" - if right_on and name in right_on: - right_on[right_on.index(name)] = f"{name}{rsuffix}" + if self.right_index: + right_keys.extend( + [ + _Indexer(name=on, index=True) + for on in self.rhs.index.names + ] + ) + if self.right_on: + # TODO: require right_on or right_index to be specified + 
right_keys.extend( + [ + _Indexer(name=on, column=True) + for on in _coerce_to_tuple(self.right_on) + ] + ) + else: + # Use `on` if provided. Otherwise, + # implicitly use identically named columns as the key columns: + on_names = ( + _coerce_to_tuple(self.on) + if self.on is not None + else set(self.lhs._data) & set(self.rhs._data) + ) + left_keys = [_Indexer(name=on, column=True) for on in on_names] + right_keys = [_Indexer(name=on, column=True) for on in on_names] + + if len(left_keys) != len(right_keys): + raise ValueError( + "Merge operands must have same number of join key columns" + ) + + self._keys = _JoinKeys(left=left_keys, right=right_keys) + + def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: + # Merge the Frames `left_result` and `right_result` into a single + # `Frame`, suffixing column names if necessary. + + # If two key columns have the same name, a single output column appears + # in the result. For all other join types, the key column from the rhs + # is simply dropped. For outer joins, the two key columns are combined + # by filling nulls in the left key column with corresponding values + # from the right key column: + if self.how == "outer": + for lkey, rkey in zip(*self._keys): + if lkey.name == rkey.name: + # fill nulls in lhs from values in the rhs + lkey.set( + left_result, + lkey.get(left_result).fillna(rkey.get(right_result)), + validate=False, + ) + + # Compute the result column names: + # left_names and right_names will be a mappings of input column names + # to the corresponding names in the final result. 
+ left_names = dict(zip(left_result._data, left_result._data)) + right_names = dict(zip(right_result._data, right_result._data)) + + # For any columns from left_result and right_result that have the same + # name: + # - if they are key columns, keep only the left column + # - if they are not key columns, use suffixes to differentiate them + # in the final result + common_names = set(left_names) & set(right_names) + + if self.on: + key_columns_with_same_name = self.on + else: + key_columns_with_same_name = [ + lkey.name + for lkey, rkey in zip(*self._keys) + if ( + (lkey.index, rkey.index) == (False, False) + and lkey.name == rkey.name + ) + ] + for name in common_names: + if name not in key_columns_with_same_name: + left_names[name] = f"{name}{self.lsuffix}" + right_names[name] = f"{name}{self.rsuffix}" + else: + del right_names[name] + + # Assemble the data columns of the result: + data = left_result._data.__class__() + + for lcol in left_names: + data.set_by_label( + left_names[lcol], left_result._data[lcol], validate=False + ) + for rcol in right_names: + data.set_by_label( + right_names[rcol], right_result._data[rcol], validate=False + ) + + # Index of the result: + if self.left_index and self.right_index: + index = left_result._index + elif self.left_index: + # left_index and right_on + index = right_result._index + elif self.right_index: + # right_index and left_on + index = left_result._index + else: + index = None - self.left_on = left_on if left_on is not None else [] - self.right_on = right_on if right_on is not None else [] - self.lsuffix = lsuffix - self.rsuffix = rsuffix + # Construct result from data and index: + result = self._out_class._from_data(data=data, index=index) + + return result + + def _sort_result(self, result: Frame) -> Frame: + # Pandas sorts on the key columns in the + # same order as given in 'on'. If the indices are used as + # keys, the index will be sorted. 
If one index is specified, + # the key columns on the other side will be used to sort. + if self.on: + if isinstance(result, cudf.Index): + sort_order = result._get_sorted_inds() + else: + # need a list instead of a tuple here because + # _get_sorted_inds calls down to ColumnAccessor.get_by_label + # which handles lists and tuples differently + sort_order = result._get_sorted_inds( + list(_coerce_to_tuple(self.on)) + ) + return result._gather(sort_order, keep_index=False) + by = [] + if self.left_index and self.right_index: + if result._index is not None: + by.extend(result._index._data.columns) + if self.left_on: + by.extend( + [result._data[col] for col in _coerce_to_tuple(self.left_on)] + ) + if self.right_on: + by.extend( + [result._data[col] for col in _coerce_to_tuple(self.right_on)] + ) + if by: + to_sort = cudf.DataFrame._from_columns(by) + sort_order = to_sort.argsort() + result = result._gather(sort_order) + return result @staticmethod - def validate_merge_cfg( + def _validate_merge_params( lhs, rhs, on, @@ -202,14 +375,11 @@ def validate_merge_cfg( left_index, right_index, how, - lsuffix, - rsuffix, suffixes, ): """ Error for various invalid combinations of merge input parameters """ - # must actually support the requested merge type if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}: raise NotImplementedError(f"{how} merge not supported yet") @@ -227,52 +397,8 @@ def validate_merge_cfg( ): raise ValueError("Can not merge on unnamed Series") - # Keys need to be in their corresponding operands - if on: - if isinstance(on, str): - on_keys = [on] - elif isinstance(on, tuple): - on_keys = list(on) - else: - on_keys = on - for key in on_keys: - if not (key in lhs._data.keys() and key in rhs._data.keys()): - raise KeyError(f"on key {on} not in both operands") - elif left_on and right_on: - left_on_keys = ( - [left_on] if not isinstance(left_on, list) else left_on - ) - right_on_keys = ( - [right_on] if not isinstance(right_on, list) else right_on 
- ) - - for key in left_on_keys: - if key not in lhs._data.keys(): - raise KeyError(f'Key "{key}" not in left operand') - for key in right_on_keys: - if key not in rhs._data.keys(): - raise KeyError(f'Key "{key}" not in right operand') - - # Require same total number of columns to join on in both operands - len_left_on = 0 - len_right_on = 0 - if left_on: - len_left_on += ( - len(left_on) if pd.api.types.is_list_like(left_on) else 1 - ) - if right_on: - len_right_on += ( - len(right_on) if pd.api.types.is_list_like(right_on) else 1 - ) - if not (len_left_on + left_index * lhs._num_indices) == ( - len_right_on + right_index * rhs._num_indices - ): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - # If nothing specified, must have common cols to use implicitly - same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys()) + same_named_columns = set(lhs._data) & set(rhs._data) if ( not (left_index or right_index) and not (left_on or right_on) @@ -280,8 +406,7 @@ def validate_merge_cfg( ): raise ValueError("No common columns to perform merge on") - if suffixes: - lsuffix, rsuffix = suffixes + lsuffix, rsuffix = suffixes for name in same_named_columns: if name == left_on == right_on: continue @@ -297,134 +422,59 @@ def validate_merge_cfg( "lsuffix and rsuffix are not defined" ) - def typecast_input_to_libcudf(self): - """ - Check each pair of join keys in the left and right hand - operands and apply casting rules to match their types - before passing the result to libcudf. 
- """ - lhs_keys, rhs_keys, lhs_cols, rhs_cols = [], [], [], [] - if self.left_index: - lhs_keys.append(self.lhs.index._data.keys()) - lhs_cols.append(self.lhs.index) - if self.right_index: - rhs_keys.append(self.rhs.index._data.keys()) - rhs_cols.append(self.rhs.index) - if self.left_on: - lhs_keys.append(self.left_on) - lhs_cols.append(self.lhs) - if self.right_on: - rhs_keys.append(self.right_on) - rhs_cols.append(self.rhs) - - for l_key_grp, r_key_grp, l_col_grp, r_col_grp in zip( - lhs_keys, rhs_keys, lhs_cols, rhs_cols - ): - for l_key, r_key in zip(l_key_grp, r_key_grp): - to_dtype = _input_to_libcudf_castrules_any( - l_col_grp._data[l_key], r_col_grp._data[r_key], self.how - ) - l_col_grp._data[l_key] = l_col_grp._data[l_key].astype( - to_dtype - ) - r_col_grp._data[r_key] = r_col_grp._data[r_key].astype( - to_dtype - ) - - def compute_output_dtypes(self): - """ - Determine what datatypes should be applied to the result - of a libcudf join, baesd on the original left and right - frames. 
- """ - - index_dtypes = {} - l_data_join_cols = {} - r_data_join_cols = {} - - data_dtypes = { - name: col.dtype - for name, col in itertools.chain( - self.lhs._data.items(), self.rhs._data.items() + def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]: + # Match the dtypes of the key columns from lhs and rhs + out_lhs = lhs.copy(deep=False) + out_rhs = rhs.copy(deep=False) + for left_key, right_key in zip(*self._keys): + lcol, rcol = left_key.get(lhs), right_key.get(rhs) + lcol_casted, rcol_casted = _match_join_keys( + lcol, rcol, how=self.how ) - } - - if self.left_index and self.right_index: - l_idx_join_cols = list(self.lhs.index._data.values()) - r_idx_join_cols = list(self.rhs.index._data.values()) - elif self.left_on and self.right_index: - # Keep the orignal dtypes in the LEFT index if possible - # should trigger a bunch of no-ops - l_idx_join_cols = list(self.lhs.index._data.values()) - r_idx_join_cols = list(self.lhs.index._data.values()) - for i, name in enumerate(self.left_on): - l_data_join_cols[name] = self.lhs._data[name] - r_data_join_cols[name] = list(self.rhs.index._data.values())[i] - - elif self.left_index and self.right_on: - # see above - l_idx_join_cols = list(self.rhs.index._data.values()) - r_idx_join_cols = list(self.rhs.index._data.values()) - for i, name in enumerate(self.right_on): - l_data_join_cols[name] = list(self.lhs.index._data.values())[i] - r_data_join_cols[name] = self.rhs._data[name] - - if self.left_on and self.right_on: - l_data_join_cols = self.lhs._data - r_data_join_cols = self.rhs._data - - if self.left_index or self.right_index: - for i in range(len(self.lhs.index._data.items())): - index_dtypes[i] = _libcudf_to_output_castrules( - l_idx_join_cols[i], r_idx_join_cols[i], self.how - ) - - for name in itertools.chain(self.left_on, self.right_on): - if name in self.left_on and name in self.right_on: - data_dtypes[name] = _libcudf_to_output_castrules( - l_data_join_cols[name], r_data_join_cols[name], 
self.how - ) - return (index_dtypes, data_dtypes) + if lcol is not lcol_casted: + left_key.set(out_lhs, lcol_casted, validate=False) + if rcol is not rcol_casted: + right_key.set(out_rhs, rcol_casted, validate=False) + return out_lhs, out_rhs + + def _restore_categorical_keys( + self, lhs: Frame, rhs: Frame + ) -> Tuple[Frame, Frame]: + # For inner joins, any categorical keys in `self.lhs` and `self.rhs` + # were casted to their category type to produce `lhs` and `rhs`. + # Here, we cast them back. + out_lhs = lhs.copy(deep=False) + out_rhs = rhs.copy(deep=False) + if self.how == "inner": + for left_key, right_key in zip(*self._keys): + if isinstance( + left_key.get(self.lhs).dtype, cudf.CategoricalDtype + ) and isinstance( + right_key.get(self.rhs).dtype, cudf.CategoricalDtype + ): + left_key.set( + out_lhs, + left_key.get(out_lhs).astype("category"), + validate=False, + ) + right_key.set( + out_rhs, + right_key.get(out_rhs).astype("category"), + validate=False, + ) + return out_lhs, out_rhs - def typecast_libcudf_to_output(self, output, output_dtypes): - """ - Apply precomputed output index and data column data types - to the output of a libcudf join. 
- """ - index_dtypes, data_dtypes = output_dtypes - if output._index and len(index_dtypes) > 0: - for index_dtype, index_col_lbl, index_col in zip( - index_dtypes.values(), - output._index._data.keys(), - output._index._data.values(), - ): - if index_dtype: - output._index._data[ - index_col_lbl - ] = self._build_output_col(index_col, index_dtype) - # reconstruct the Index object as the underlying data types - # have changed: - output._index = cudf.core.index.Index._from_table(output._index) - - for data_col_lbl, data_col in output._data.items(): - data_dtype = data_dtypes[data_col_lbl] - if data_dtype: - output._data[data_col_lbl] = self._build_output_col( - data_col, data_dtype - ) - return output +class MergeSemi(Merge): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._joiner = functools.partial( + libcudf.join.semi_join, how=kwargs["how"] + ) - def _build_output_col(self, col, dtype): - if isinstance( - dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype) - ): - outcol = cudf.core.column.build_categorical_column( - categories=dtype.categories, - codes=col.set_mask(None), - mask=col.base_mask, - ordered=dtype.ordered, - ) + def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: + # semi-join result includes only lhs columns + if issubclass(self._out_class, cudf.Index): + return self._out_class._from_data(lhs._data) else: - outcol = col.astype(dtype) - return outcol + return self._out_class._from_data(lhs._data, index=lhs._index) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 82e89bb00f4..1c1e48e7372 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+from __future__ import annotations import itertools import numbers @@ -18,6 +19,7 @@ from cudf._typing import DataFrameOrSeries from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import column +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, as_index @@ -188,6 +190,19 @@ def names(self): def names(self, value): value = [None] * self.nlevels if value is None else value assert len(value) == self.nlevels + + if len(value) == len(set(value)): + # IMPORTANT: if the provided names are unique, + # we reconstruct self._data with the names as keys. + # If they are not unique, the keys of self._data + # and self._names will be different, which can lead + # to unexpected behaviour in some cases. This is + # definitely buggy, but we can't disallow non-unique + # names either... + self._data = self._data.__class__._create_unsafe( + dict(zip(value, self._data.values())), + level_names=self._data.level_names, + ) self._names = pd.core.indexes.frozen.FrozenList(value) def rename(self, names, inplace=False): @@ -234,7 +249,6 @@ def rename(self, names, inplace=False): ValueError: Length of names must match number of levels in MultiIndex. 
""" - return self.set_names(names, level=None, inplace=inplace) def set_names(self, names, level=None, inplace=False): @@ -278,6 +292,10 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) + @classmethod + def _from_data(cls, data: ColumnAccessor, index=None) -> MultiIndex: + return cls.from_frame(cudf.DataFrame._from_data(data)) + @classmethod def _from_table(cls, table, names=None): df = cudf.DataFrame(table._data) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a664c4fb182..71a4a48a07a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6299,17 +6299,24 @@ def merge( method="hash", suffixes=("_x", "_y"), ): - if left_on not in (self.name, None): raise ValueError( "Series to other merge uses series name as key implicitly" ) - lhs = self.copy(deep=False) - rhs = other.copy(deep=False) + if lsuffix or rsuffix: + raise ValueError( + "The lsuffix and rsuffix keywords have been replaced with the " + "``suffixes=`` keyword. 
" + "Please provide the following instead: \n\n" + " suffixes=('%s', '%s')" + % (lsuffix or "_x", rsuffix or "_y") + ) + else: + lsuffix, rsuffix = suffixes - result = super(Series, lhs)._merge( - rhs, + result = super()._merge( + other, on=on, left_on=left_on, right_on=right_on, @@ -6317,8 +6324,6 @@ def merge( right_index=right_index, how=how, sort=sort, - lsuffix=lsuffix, - rsuffix=rsuffix, method=method, indicator=False, suffixes=suffixes, diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 969cf1bf549..9164bfe98d1 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -14,11 +14,13 @@ assert_exceptions_equal, ) +_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") + def make_params(): np.random.seed(0) - hows = "left,inner,outer,right,leftanti,leftsemi".split(",") + hows = _JOIN_TYPES methods = "hash,sort".split(",") # Test specific cases (1) @@ -69,6 +71,37 @@ def pd_odd_joins(left, right, join_type): return left[left.index.isin(right.index)][left.columns] +def assert_join_results_equal(expect, got, how, **kwargs): + if how not in _JOIN_TYPES: + raise ValueError(f"Unrecognized join type {how}") + if how == "right": + got = got[expect.columns] + + if isinstance(expect, (pd.Series, cudf.Series)): + return assert_eq( + expect.sort_values().reset_index(drop=True), + got.sort_values().reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): + if not len( + expect.columns + ): # can't sort_values() on a df without columns + return assert_eq(expect, got, **kwargs) + + return assert_eq( + expect.sort_values(expect.columns.to_list()).reset_index( + drop=True + ), + got.sort_values(got.columns.to_list()).reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.Index, cudf.Index)): + return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) + else: + raise ValueError(f"Not a join result: 
{type(expect).__name__}") + + @pytest.mark.parametrize("aa,bb,how,method", make_params()) def test_dataframe_join_how(aa, bb, how, method): df = cudf.DataFrame() @@ -113,12 +146,7 @@ def work_gdf(df): # TODO: What is the less hacky way? expect.index.name = "bob" got.index.name = "mary" - assert_eq( - got.sort_values(got.columns.to_list()).reset_index(drop=True), - expect.sort_values(expect.columns.to_list()).reset_index( - drop=True - ), - ) + assert_join_results_equal(expect, got, how=how) # if(how=='right'): # _sorted_check_series(expect['a'], expect['b'], # got['a'], got['b']) @@ -187,10 +215,7 @@ def test_dataframe_join_cats(): expect = lhs.to_pandas().join(rhs.to_pandas()) # Note: pandas make an object Index after joining - assert_eq( - got.sort_values(by="b").sort_index().reset_index(drop=True), - expect.reset_index(drop=True), - ) + assert_join_results_equal(expect, got, how="inner") # Just do some rough checking here. assert list(got.columns) == ["b", "c"] @@ -264,7 +289,7 @@ def test_dataframe_join_mismatch_cats(how): expect.data_col_right = expect.data_col_right.astype(np.int64) expect.data_col_left = expect.data_col_left.astype(np.int64) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how=how, check_categorical=False) @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) @@ -323,7 +348,7 @@ def test_dataframe_merge_on(on): list(pddf_joined.columns) ).reset_index(drop=True) - assert_eq(cdf_result, pdf_result, check_like=True) + assert_join_results_equal(cdf_result, pdf_result, how="left") merge_func_result_cdf = ( join_result_cudf.to_pandas() @@ -331,7 +356,7 @@ def test_dataframe_merge_on(on): .reset_index(drop=True) ) - assert_eq(merge_func_result_cdf, cdf_result, check_like=True) + assert_join_results_equal(merge_func_result_cdf, cdf_result, how="left") def test_dataframe_merge_on_unknown_column(): @@ -383,7 +408,7 @@ def test_dataframe_empty_merge(): expect = cudf.DataFrame({"a": [], "b": [], "c": []}) got = 
gdf1.merge(gdf2, how="left", on=["a"]) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") def test_dataframe_merge_order(): @@ -408,7 +433,7 @@ def test_dataframe_merge_order(): df2["a"] = [7, 8] df = df1.merge(df2, how="left", on=["id", "a"]) - assert_eq(gdf, df) + assert_join_results_equal(df, gdf, how="left") @pytest.mark.parametrize( @@ -550,7 +575,7 @@ def test_merge_left_index_zero(): pd_merge = left.merge(right, left_on="x", right_on="y") gd_merge = gleft.merge(gright, left_on="x", right_on="y") - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") @pytest.mark.parametrize( @@ -571,7 +596,7 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = gleft.merge(gright, **kwargs) - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") @pytest.mark.parametrize( @@ -592,7 +617,7 @@ def test_merge_left_right_index_left_right_on_kwargs(kwargs): gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) gd_merge = gleft.merge(gright, **kwargs) - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") def test_indicator(): @@ -608,9 +633,10 @@ def test_indicator(): def test_merge_suffixes(): pdf = cudf.DataFrame({"x": [1, 2, 1]}) gdf = cudf.DataFrame({"x": [1, 2, 1]}) - assert_eq( + assert_join_results_equal( gdf.merge(gdf, suffixes=("left", "right")), pdf.merge(pdf, suffixes=("left", "right")), + how="left", ) assert_exceptions_equal( @@ -628,11 +654,14 @@ def test_merge_left_on_right_on(): gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) - assert_eq(left.merge(right, on="xx"), gleft.merge(gright, on="xx")) + assert_join_results_equal( + left.merge(right, on="xx"), gleft.merge(gright, on="xx"), how="left" + ) - assert_eq( + assert_join_results_equal( left.merge(right, left_on="xx", right_on="xx"), 
gleft.merge(gright, left_on="xx", right_on="xx"), + how="left", ) @@ -708,7 +737,9 @@ def test_merge_sort(ons, hows): pd_merge = left.merge(right, **kwargs) # require the join keys themselves to be sorted correctly # the non-key columns will NOT match pandas ordering - assert_eq(pd_merge[kwargs["on"]], gd_merge[kwargs["on"]]) + assert_join_results_equal( + pd_merge[kwargs["on"]], gd_merge[kwargs["on"]], how="left" + ) pd_merge = pd_merge.drop(kwargs["on"], axis=1) gd_merge = gd_merge.drop(kwargs["on"], axis=1) if not pd_merge.empty: @@ -720,7 +751,7 @@ def test_merge_sort(ons, hows): drop=True ) - assert_eq(pd_merge, gd_merge) + assert_join_results_equal(pd_merge, gd_merge, how="left") @pytest.mark.parametrize( @@ -781,7 +812,7 @@ def test_join_datetimes_index(dtype): assert gdf["d"].dtype == np.dtype(dtype) - assert_eq(pdf, gdf) + assert_join_results_equal(pdf, gdf, how="inner") def test_join_with_different_names(): @@ -791,7 +822,7 @@ def test_join_with_different_names(): gright = cudf.from_pandas(right) pd_merge = left.merge(right, how="outer", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="outer", left_on=["a"], right_on=["b"]) - assert_eq(pd_merge, gd_merge.sort_values(by=["a"]).reset_index(drop=True)) + assert_join_results_equal(pd_merge, gd_merge, how="outer") def test_join_same_name_different_order(): @@ -801,9 +832,7 @@ def test_join_same_name_different_order(): gright = cudf.from_pandas(right) pd_merge = left.merge(right, left_on=["a", "b"], right_on=["b", "a"]) gd_merge = gleft.merge(gright, left_on=["a", "b"], right_on=["b", "a"]) - assert_eq( - pd_merge, gd_merge.sort_values(by=["a_x"]).reset_index(drop=True) - ) + assert_join_results_equal(pd_merge, gd_merge, how="left") def test_join_empty_table_dtype(): @@ -874,10 +903,7 @@ def test_join_multi(how, column_a, column_b, column_c): gdf_result = gdf_result[columns] pdf_result = pdf_result[columns] - assert_eq( - gdf_result.reset_index(drop=True).fillna(-1), - 
pdf_result.sort_index().reset_index(drop=True).fillna(-1), - ) + assert_join_results_equal(pdf_result, gdf_result, how="inner") @pytest.mark.parametrize( @@ -967,7 +993,7 @@ def test_merge_multi(kwargs): expect.index = range(len(expect)) got.index = range(len(got)) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize("dtype_l", INTEGER_TYPES) @@ -997,7 +1023,7 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize("dtype_l", ["float32", "float64"]) @@ -1032,7 +1058,7 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize("dtype_l", NUMERIC_TYPES) @@ -1068,7 +1094,7 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def test_typecast_on_join_no_float_round(): @@ -1092,7 +1118,7 @@ def test_typecast_on_join_no_float_round(): got = gdf_l.merge(gdf_r, on="join_col", how="left") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize( @@ -1121,10 +1147,7 @@ def test_typecast_on_join_overflow_unsafe(dtypes): with pytest.warns( UserWarning, - match=( - f"can't safely cast column" - f" from right with type {dtype_r} to {dtype_l}" - ), + match=(f"Can't safely cast column" f" from {dtype_r} to {dtype_l}"), ): merged = lhs.merge(rhs, on="a", how="left") # noqa: F841 @@ -1165,7 +1188,7 @@ def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r): got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize("dtype_l", ["category", 
"str", "int32", "float32"]) @@ -1200,7 +1223,7 @@ def test_typecast_on_join_categorical(dtype_l, dtype_r): ) got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def make_categorical_dataframe(categories, ordered=False): @@ -1220,7 +1243,7 @@ def test_categorical_typecast_inner(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_eq(expect_data, result["key"], check_categorical=False) # Equal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1238,7 +1261,7 @@ def test_categorical_typecast_inner(): expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False) expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_eq(expect_data, result["key"], check_categorical=False) # One is ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1427,20 +1450,10 @@ def test_index_join(lhs, rhs, how, level): g_lhs = l_df.set_index(lhs).index g_rhs = r_df.set_index(rhs).index - expected = ( - p_lhs.join(p_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) - got = ( - g_lhs.join(g_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) def test_index_join_corner_cases(): @@ -1461,20 +1474,10 @@ def test_index_join_corner_cases(): p_rhs = r_pdf.set_index(rhs).index g_lhs = l_df.set_index(lhs).index g_rhs = r_df.set_index(rhs).index - expected = ( - p_lhs.join(p_rhs, level=level, how=how) - 
.to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) - got = ( - g_lhs.join(g_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) # sort is supported only in case of two non-MultiIndex join # Join when column name doesn't match with level @@ -1490,7 +1493,7 @@ def test_index_join_corner_cases(): expected = p_lhs.join(p_rhs, how=how, sort=True) got = g_lhs.join(g_rhs, how=how, sort=True) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) # Pandas Index.join on categorical column returns generic column # but cudf will be returning a categorical column itself. @@ -1504,22 +1507,12 @@ def test_index_join_corner_cases(): p_rhs = r_pdf.set_index(rhs).index g_lhs = l_df.set_index(lhs).index g_rhs = r_df.set_index(rhs).index - expected = ( - p_lhs.join(p_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) - got = ( - g_lhs.join(g_rhs, level=level, how=how) - .to_frame(index=False) - .sort_values(by=lhs) - .reset_index(drop=True) - ) + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) got["a"] = got["a"].astype(expected["a"].dtype) - assert_eq(expected, got) + assert_join_results_equal(expected, got, how=how) def test_index_join_exception_cases(): @@ -1573,7 +1566,7 @@ def test_typecast_on_join_indexes(): got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def test_typecast_on_join_multiindices(): @@ -1624,7 +1617,7 @@ def test_typecast_on_join_multiindices(): expect = expect.set_index(["join_col_0", "join_col_1", 
"join_col_2"]) got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") def test_typecast_on_join_indexes_matching_categorical(): @@ -1651,7 +1644,7 @@ def test_typecast_on_join_indexes_matching_categorical(): expect = expect.set_index("join_col") got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="inner") @pytest.mark.parametrize( @@ -1703,9 +1696,10 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): expect = check_lhs.merge(check_rhs, how=how, **kwargs) got = lhs.merge(rhs, how=how, **kwargs) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how=how) +@pytest.mark.xfail(reason="Cannot sort values of list dtype") @pytest.mark.parametrize( "how", ["left", "inner", "right", "leftanti", "leftsemi"] ) @@ -1730,4 +1724,17 @@ def test_merge_with_lists(how): expect = pd_left.merge(pd_right, on="a") got = gd_left.merge(gd_right, on="a") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how=how) + + +def test_join_renamed_index(): + df = cudf.DataFrame( + {0: [1, 2, 3, 4, 5], 1: [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 5]} + ).set_index([0, 1]) + df.index.names = ["a", "b"] # doesn't actually change df._index._data + + expect = df.to_pandas().merge( + df.to_pandas(), left_index=True, right_index=True + ) + got = df.merge(df, left_index=True, right_index=True, how="inner") + assert_join_results_equal(expect, got, how="inner") diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 8b1ad696f04..2ca6bc622be 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -17,6 +17,7 @@ from cudf.core._compat import PANDAS_GE_110 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index +from cudf.tests.test_joining import assert_join_results_equal from 
cudf.tests.utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -919,16 +920,12 @@ def test_string_split(data, pat, n, expand): @pytest.mark.parametrize( - "str_data,str_data_raise", - [ - ([], 0), - (["a", "b", "c", "d", "e"], 0), - ([None, None, None, None, None], 1), - ], + "str_data", + [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], ) @pytest.mark.parametrize("num_keys", [1, 2, 3]) @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_key(str_data, str_data_raise, num_keys, how): +def test_string_join_key(str_data, num_keys, how): other_data = [1, 2, 3, 4, 5][: len(str_data)] pdf = pd.DataFrame() @@ -942,19 +939,17 @@ def test_string_join_key(str_data, str_data_raise, num_keys, how): pdf2 = pdf.copy() gdf2 = gdf.copy() - expectation = raise_builder( - [0 if how == "right" else str_data_raise], (AssertionError) - ) + expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) + got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) - with expectation: - expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) - got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) + if len(expect) == 0 and len(got) == 0: + expect = expect.reset_index(drop=True) + got = got[expect.columns] # reorder columns - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] + if how == "right": + got = got[expect.columns] # reorder columns - assert_eq(expect, got) + assert_join_results_equal(expect, got, how=how) @pytest.mark.parametrize( @@ -998,7 +993,7 @@ def test_string_join_key_nulls(str_data_nulls): expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize( @@ -1027,7 +1022,10 @@ def test_string_join_non_key(str_data, num_cols, how): expect = expect.reset_index(drop=True) got = got[expect.columns] - assert_eq(expect, got) + if how == "right": + got = 
got[expect.columns] # reorder columns + + assert_join_results_equal(expect, got, how=how) @pytest.mark.parametrize( @@ -1068,7 +1066,7 @@ def test_string_join_non_key_nulls(str_data_nulls): expect = expect.reset_index(drop=True) got = got[expect.columns] - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") def test_string_join_values_nulls(): @@ -1108,7 +1106,7 @@ def test_string_join_values_nulls(): expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - assert_eq(expect, got) + assert_join_results_equal(expect, got, how="left") @pytest.mark.parametrize(