diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index ea2fda399fd..9c13b27ff9d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -133,12 +133,13 @@ test: - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp + - test -f $PREFIX/include/cudf/lists/detail/combine.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - - test -f $PREFIX/include/cudf/lists/concatenate_rows.hpp + - test -f $PREFIX/include/cudf/lists/combine.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index abad4d7bbca..af6f60b031d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -266,7 +266,8 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu - src/lists/concatenate_rows.cu + src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu src/lists/copying/gather.cu diff --git a/cpp/include/cudf/lists/concatenate_rows.hpp b/cpp/include/cudf/lists/combine.hpp similarity index 57% rename from cpp/include/cudf/lists/concatenate_rows.hpp rename to cpp/include/cudf/lists/combine.hpp index 1d93de418f8..a9407ed57ca 100644 --- a/cpp/include/cudf/lists/concatenate_rows.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -21,7 +21,7 @@ namespace cudf { namespace lists { /** - * @addtogroup lists_concatenate_rows + * @addtogroup lists_combine * @{ * @file */ @@ -53,16 +53,47 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from - * concatenation, or any concatenation involving a null list element will result in a null list. + * concatenation, or any concatenation involving a null element will result in a null list. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in - * the corresponding row of the input table. + * the corresponding row of the input table. */ std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Concatenating multiple lists on the same row of a lists column into a single list. + * + * Given a lists column where each row in the column is a list of lists of entries, an output lists + * column is generated by concatenating all the list elements at the same row together. If any row + * contains null list elements, the concatenation process will either ignore those null elements, or + * will simply set the entire resulting row to be a null element. + * + * @code{.pseudo} + * l = [ [{1, 2}, {3, 4}, {5}], [{6}, {}, {7, 8, 9}] ] + * r = lists::concatenate_list_elements(l); + * r is [ {1, 2, 3, 4, 5}, {6, 7, 8, 9} ] + * @endcode + * + * @throws cudf::logic_error if the input column is not at least two-level depth lists column (i.e., + * each row must be a list of list). + * @throws cudf::logic_error if the input lists column contains nested typed entries that are not + * lists. + * + * @param input The lists column containing lists of list elements to concatenate. + * @param null_policy The parameter to specify whether a null list element will be ignored from + * concatenation, or any concatenation involving a null element will result in a null list. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A new column in which each row is a list resulted from concatenating all list elements in + * the corresponding row of the input lists column. + */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp new file mode 100644 index 00000000000..9f28074173a --- /dev/null +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_rows( + table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::lists::concatenate_list_elements + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 11b907e7f16..dda8ce87432 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -143,7 +143,7 @@ * @} * @defgroup lists_apis Lists * @{ - * @defgroup lists_concatenate_rows Combining + * @defgroup lists_combine Combining * @defgroup lists_extract Extracting * @defgroup lists_contains Searching * @defgroup lists_gather Gathering diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu new file mode 100644 index 00000000000..b76cd19d94b --- /dev/null +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +namespace { +/** + * @brief Concatenate lists within the same row into one list, ignoring any null list during + * concatenation. + */ +std::unique_ptr concatenate_lists_ignore_null(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.size(); + + static_assert(std::is_same_v && std::is_same_v); + auto out_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + + auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + + // Concatenating the lists at the same row by converting the entry offsets from the child column + // into row offsets of the root column. Those entry offsets are subtracted by the first entry + // offset to output zero-based offsets. + auto const iter = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + d_out_offsets, + [d_row_offsets, d_list_offsets] __device__(auto const idx) { + auto const start_offset = d_list_offsets[d_row_offsets[0]]; + return d_list_offsets[d_row_offsets[idx]] - start_offset; + }); + + // The child column of the output lists column is just copied from the input column. + auto out_entries = std::make_unique( + lists_column_view(lists_column_view(input).get_sliced_child(stream)).get_sliced_child(stream)); + + return make_lists_column(num_rows, + std::move(out_offsets), + std::move(out_entries), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr), + stream, + mr); +} + +/** + * @brief Generate list offsets and list validities for the output lists column. + * + * This function is called only when (has_null_list == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +std::pair, rmm::device_uvector> +generate_list_offsets_and_validities(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.size(); + + static_assert(std::is_same_v && std::is_same_v); + auto out_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + + auto const lists_of_lists_dv_ptr = column_device_view::create(input); + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + + // The array of int8_t stores validities for the output list elements. + auto validities = rmm::device_uvector(num_rows, stream); + + // Compute output list sizes and validities. + auto const iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_rows, + d_out_offsets, + [lists_of_lists_dv = *lists_of_lists_dv_ptr, + lists_dv = *lists_dv_ptr, + d_row_offsets, + d_list_offsets, + d_validities = validities.begin(), + iter] __device__(auto const idx) { + if (d_row_offsets[idx] == d_row_offsets[idx + 1]) { // This is a null/empty row. + d_validities[idx] = static_cast(lists_of_lists_dv.is_valid(idx)); + return size_type{0}; + } + // The output row will not be null only if all lists on the input row are not null. + auto const is_valid = + thrust::all_of(thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { return lists_dv.is_valid(list_idx); }); + d_validities[idx] = static_cast(is_valid); + if (!is_valid) { return size_type{0}; } + + // Compute size of the output list as sum of sizes of all lists in the current input row. + return d_list_offsets[d_row_offsets[idx + 1]] - d_list_offsets[d_row_offsets[idx]]; + }); + + // Compute offsets from sizes. + thrust::exclusive_scan( + rmm::exec_policy(stream), d_out_offsets, d_out_offsets + num_rows + 1, d_out_offsets); + + return {std::move(out_offsets), std::move(validities)}; +} + +/** + * @brief Gather entries from the input lists column, ignoring rows that have null list elements. + * + * This function is called only when (has_null_list == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +std::unique_ptr gather_list_entries(column_view const& input, + column_view const& output_list_offsets, + size_type num_rows, + size_type num_output_entries, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const child_col = lists_column_view(input).child(); + auto const entry_col = lists_column_view(child_col).child(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(child_col).offsets_begin(); + auto gather_map = rmm::device_uvector(num_output_entries, stream); + + // Fill the gather map with indices of the lists from the child column of the input column. + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_rows, + [d_row_offsets, + d_list_offsets, + d_indices = gather_map.begin(), + d_out_list_offsets = + output_list_offsets.template begin()] __device__(size_type const idx) { + // The output row has been identified as a null/empty list during list size computation. + if (d_out_list_offsets[idx + 1] == d_out_list_offsets[idx]) { return; } + + // The indices of the list elements on the row `idx` of the input column. + thrust::sequence(thrust::seq, + d_indices + d_out_list_offsets[idx], + d_indices + d_out_list_offsets[idx + 1], + d_list_offsets[d_row_offsets[idx]]); + }); + + auto result = cudf::detail::gather(table_view{{entry_col}}, + gather_map.begin(), + gather_map.end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(result->release()[0]); +} + +std::unique_ptr concatenate_lists_nullifying_rows(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Generate offsets and validities of the output lists column. + auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr); + auto const offsets_view = list_offsets->view(); + + auto const num_rows = input.size(); + auto const num_output_entries = + cudf::detail::get_value(offsets_view, num_rows, stream); + + auto list_entries = + gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr); + auto [null_mask, null_count] = cudf::detail::valid_if( + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + + return make_lists_column(num_rows, + std::move(list_offsets), + std::move(list_entries), + null_count, + null_count ? std::move(null_mask) : rmm::device_buffer{}, + stream, + mr); +} + +} // namespace + +/** + * @copydoc cudf::lists::concatenate_list_elements + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto type = input.type(); // Column that is lists of lists. + CUDF_EXPECTS(type.id() == type_id::LIST, "Input column must be a lists column."); + + auto col = lists_column_view(input).child(); // Rows, which are lists. + type = col.type(); + CUDF_EXPECTS(type.id() == type_id::LIST, "Rows of the input column must be lists."); + + col = lists_column_view(col).child(); // The last level entries what we need to check. + type = col.type(); + CUDF_EXPECTS(type.id() == type_id::LIST || !cudf::is_nested(type), + "Entry of the input lists column must be of list or non-nested types."); + + if (input.size() == 0) { return cudf::empty_like(input); } + + return (null_policy == concatenate_null_policy::IGNORE || + !lists_column_view(input).child().has_nulls()) + ? concatenate_lists_ignore_null(input, stream, mr) + : concatenate_lists_nullifying_rows(input, stream, mr); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::concatenate_list_elements + */ +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_list_elements(input, null_policy, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu new file mode 100644 index 00000000000..fdd71aea7bf --- /dev/null +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_rows(table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); + + auto const entry_type = lists_column_view(*input.begin()).child().type(); + for (auto const& col : input) { + CUDF_EXPECTS(col.type().id() == type_id::LIST, + "All columns of the input table must be of lists column type."); + + auto const child_col = lists_column_view(col).child(); + CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); + CUDF_EXPECTS(entry_type == child_col.type(), + "The types of entries in the input columns must be the same."); + } + + auto const num_rows = input.num_rows(); + auto const num_cols = input.num_columns(); + if (num_rows == 0) { return cudf::empty_like(input.column(0)); } + if (num_cols == 1) { return std::make_unique(*(input.begin()), stream, mr); } + + // Memory resource for temporary data. + auto const default_mr = rmm::mr::get_current_device_resource(); + + // Interleave the input table into one column. + auto const has_null_mask = std::any_of( + std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); + auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream, default_mr); + + // Generate a lists column which has child column is the interleaved_columns. + // The new nested lists column will have each row is a list of `num_cols` list elements. + static_assert(std::is_same_v and std::is_same_v); + auto list_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, default_mr); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows + 1), + list_offsets->mutable_view().template begin(), + [num_cols] __device__(auto const idx) { return idx * num_cols; }); + auto const nested_lists_col = make_lists_column(num_rows, + std::move(list_offsets), + std::move(interleaved_columns), + 0, + rmm::device_buffer{}, + stream, + default_mr); + + // Concatenate lists on each row of the nested lists column, producing the desired output. + return concatenate_list_elements(nested_lists_col->view(), null_policy, stream, mr); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::concatenate_rows + */ +std::unique_ptr concatenate_rows(table_view const& lists_columns, + concatenate_null_policy null_policy, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_rows(lists_columns, null_policy, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/src/lists/concatenate_rows.cu b/cpp/src/lists/concatenate_rows.cu deleted file mode 100644 index 8528a7680f7..00000000000 --- a/cpp/src/lists/concatenate_rows.cu +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -namespace cudf { -namespace lists { -namespace detail { -namespace { -/** - * @brief Concatenate lists within the same row into one list, ignoring any null list during - * concatenation. - */ -std::unique_ptr concatenate_rows_ignore_null(table_view const& input, - bool has_null_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_output_lists = input.num_rows(); - auto const table_dv_ptr = table_device_view::create(input); - - // Interleave the list element from the input table, thus all the lists at the same row now stay - // next to each other. - auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream); - - // Modify the list offsets to combine lists of the same input row. - static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); - auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); - - // The array of int8_t to store validities for list elements. - // Since we combine multiple lists, we may need to recompute list validities. - auto validities = rmm::device_uvector(has_null_mask ? num_output_lists : 0, stream); - - // For an input table of `n` columns, if after interleaving we have the list offsets are - // [ i_0, i_1, ..., i_n, i_n+1, ..., i_2n, ... ] then to concatenate them just modify the offsets - // to be [ i_0, i_n, i_2n, i_3n, ... ]. - auto const d_interleaved_offsets = lists_column_view(interleaved_columns->view()).offsets_begin(); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_output_lists + 1), - d_offsets, - [d_interleaved_offsets, - num_cols = input.num_columns(), - table_dv = *table_dv_ptr, - d_validities = validities.begin(), - has_null_mask] __device__(auto const idx) { - if (has_null_mask) { - auto const any_valid = thrust::any_of( - thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { - return list_col.is_valid(idx); - }); - d_validities[idx] = static_cast(any_valid); - } - return d_interleaved_offsets[idx * num_cols]; - }); - - auto [null_mask, null_count] = [&] { - return has_null_mask - ? cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr) - : std::make_pair(rmm::device_buffer{}, size_type{0}); - }(); - - // The child column containing list entries is taken from the `interleaved_columns` column. - auto interleaved_columns_content = interleaved_columns->release(); - - return make_lists_column( - num_output_lists, - std::move(list_offsets), - std::move(interleaved_columns_content.children[lists_column_view::child_column_index]), - null_count, - null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, - stream, - mr); -} - -/** - * @brief Generate list offsets and list validities for the output lists column from the table_view - * of the input lists columns. - * - * This function is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - */ -std::pair, rmm::device_uvector> -generate_list_offsets_and_validities(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_output_lists = input.num_rows(); - auto const table_dv_ptr = table_device_view::create(input); - - // The output offsets column. - static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); - auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); - - // The array of int8_t to store validities for list elements. - auto validities = rmm::device_uvector(num_output_lists, stream); - - // Compute list sizes and validities. - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_output_lists), - d_offsets, - [table_dv = *table_dv_ptr, d_validities = validities.begin()] __device__(size_type const idx) { - auto const all_valid = - thrust::all_of(thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { - return list_col.is_valid(idx); - }); - d_validities[idx] = static_cast(all_valid); - if (not all_valid) return size_type{0}; - - // Compute size of the output list as sum of sizes of input lists - return thrust::transform_reduce( - thrust::seq, - table_dv.begin(), - table_dv.end(), - [idx] __device__(auto const& lists_col) { - auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + - lists_col.offset(); - return list_offsets[idx + 1] - list_offsets[idx]; // list size - }, - size_type{0}, - thrust::plus{}); - }); - - // Compute offsets from sizes. - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + num_output_lists + 1, d_offsets); - - return {std::move(list_offsets), std::move(validities)}; -} - -/** - * @brief Compute string sizes, string validities, and concatenate string lists functor. - * - * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - * It is executed twice. In the first pass, the sizes and validities of the output strings will be - * computed. In the second pass, this will concatenate the lists of strings on the same row from the - * given input table. - */ -struct compute_string_sizes_and_concatenate_lists_fn { - table_device_view const table_dv; - - // Store list offsets of the output lists column. - offset_type const* const dst_list_offsets; - - // Store offsets of the strings. - offset_type* d_offsets{nullptr}; - - // If d_chars == nullptr: only compute sizes and validities of the output strings. - // If d_chars != nullptr: only concatenate lists of strings. - char* d_chars{nullptr}; - - // We need to set `1` or `0` for the validities of the strings in the child column. - int8_t* d_validities{nullptr}; - - __device__ void operator()(size_type const idx) - { - // The current row contain null, which has been identified during offsets computation. - if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } - - // read_idx and write_idx are indices of string elements. - size_type write_idx = dst_list_offsets[idx]; - thrust::for_each( - thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { - auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + - lists_col.offset(); - auto const& str_col = lists_col.child(lists_column_view::child_column_index); - auto const str_offsets = - str_col.child(strings_column_view::offsets_column_index).template data(); - - // The range of indices of the strings within the source list. - auto const start_str_idx = list_offsets[idx]; - auto const end_str_idx = list_offsets[idx + 1]; - - if (not d_chars) { // just compute sizes of strings within a list - for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) { - d_validities[write_idx] = static_cast(str_col.is_valid(read_idx)); - d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx]; - } - } else { // just copy the entire memory region containing all strings in the list - // start_byte and end_byte are indices of character of the string elements. - auto const start_byte = str_offsets[start_str_idx]; - auto const end_byte = str_offsets[end_str_idx]; - if (start_byte < end_byte) { - auto const input_ptr = - str_col.child(strings_column_view::chars_column_index).template data() + - start_byte; - auto const output_ptr = d_chars + d_offsets[write_idx]; - thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr); - } - write_idx += end_str_idx - start_str_idx; - } - }); - } -}; - -/** - * @brief Struct used in type_dispatcher to interleave list entries of the input lists columns and - * output the results into a destination column. - * - * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - */ -struct concatenate_lists_fn { - template - std::enable_if_t, std::unique_ptr> operator()( - table_view const& input, - column_view const& output_list_offsets, - size_type num_output_lists, - size_type num_output_entries, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const noexcept - { - auto const table_dv_ptr = table_device_view::create(input); - auto const comp_fn = compute_string_sizes_and_concatenate_lists_fn{ - *table_dv_ptr, output_list_offsets.template begin()}; - - // Generate a null mask because the input table has nullable column. - auto [offsets_column, chars_column, null_mask, null_count] = - cudf::strings::detail::make_strings_children_with_null_mask( - comp_fn, num_output_lists, num_output_entries, stream, mr); - - return make_strings_column(num_output_entries, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - table_view const& input, - column_view const& output_list_offsets, - size_type num_output_lists, - size_type num_output_entries, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const noexcept - { - auto const table_dv_ptr = table_device_view::create(input); - - // The output child column. - auto const child_col = lists_column_view(*input.begin()).child(); - auto output = - allocate_like(child_col, num_output_entries, mask_allocation_policy::NEVER, stream, mr); - auto output_dv_ptr = mutable_column_device_view::create(*output); - - // The array of int8_t to store entry validities. - auto validities = rmm::device_uvector(num_output_entries, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_output_lists, - [num_cols = input.num_columns(), - table_dv = *table_dv_ptr, - d_validities = validities.begin(), - dst_list_offsets = output_list_offsets.template begin(), - d_output = output_dv_ptr->template begin()] __device__(size_type const idx) { - // The output row has been identified as a null list during list size computation. - if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } - - auto write_start = dst_list_offsets[idx]; - thrust::for_each( - thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { - auto const list_offsets = lists_col.child(lists_column_view::offsets_column_index) - .template data() + - lists_col.offset(); - auto const& data_col = lists_col.child(lists_column_view::child_column_index); - - // The range of indices of the entries within the source list. - auto const start_idx = list_offsets[idx]; - auto const end_idx = list_offsets[idx + 1]; - - // Fill the validities array. - for (auto read_idx = start_idx, write_idx = write_start; read_idx < end_idx; - ++read_idx, ++write_idx) { - d_validities[write_idx] = static_cast(data_col.is_valid(read_idx)); - } - // Do a copy for the entire list entries. - auto const input_ptr = - reinterpret_cast(data_col.template data() + start_idx); - auto const output_ptr = reinterpret_cast(&d_output[write_start]); - thrust::copy( - thrust::seq, input_ptr, input_ptr + sizeof(T) * (end_idx - start_idx), output_ptr); - write_start += end_idx - start_idx; - }); - }); - - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(null_mask, null_count); } - - return output; - } - - template - std::enable_if_t and not cudf::is_fixed_width(), - std::unique_ptr> - operator()(table_view const&, - column_view const&, - size_type, - size_type, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const - { - // Currently, only support string_view and fixed-width types - CUDF_FAIL("Called `concatenate_lists_fn()` on non-supported types."); - } -}; - -std::unique_ptr concatenate_with_nullifying_rows(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Generate offsets of the output lists column. - auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr); - auto const offsets_view = list_offsets->view(); - - // Copy entries from the input lists columns to the output lists column - this needed to be - // specialized for different types. - auto const num_output_lists = input.num_rows(); - auto const num_output_entries = - cudf::detail::get_value(offsets_view, num_output_lists, stream); - auto list_entries = - type_dispatcher(lists_column_view(*input.begin()).child().type(), - concatenate_lists_fn{}, - input, - offsets_view, - num_output_lists, - num_output_entries, - stream, - mr); - - auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); - return make_lists_column(num_output_lists, - std::move(list_offsets), - std::move(list_entries), - null_count, - null_count ? std::move(null_mask) : rmm::device_buffer{}, - stream, - mr); -} - -} // namespace - -/** - * @copydoc cudf::lists::concatenate_rows - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr concatenate_rows(table_view const& input, - concatenate_null_policy null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); - - auto const entry_type = lists_column_view(*input.begin()).child().type(); - for (auto const& col : input) { - CUDF_EXPECTS(col.type().id() == type_id::LIST, - "All columns of the input table must be of lists column type."); - - auto const child_col = lists_column_view(col).child(); - CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); - CUDF_EXPECTS(entry_type == child_col.type(), - "The types of entries in the input columns must be the same."); - } - - if (input.num_rows() == 0) { return cudf::empty_like(input.column(0)); } - if (input.num_columns() == 1) { return std::make_unique(*(input.begin()), stream, mr); } - - // List concatenation can be implemented by simply interleaving the lists columns, then modify the - // list offsets. - auto const has_null_mask = std::any_of( - std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - if (not has_null_mask or null_policy == concatenate_null_policy::IGNORE) { - return concatenate_rows_ignore_null(input, has_null_mask, stream, mr); - } - - // Both conditions satisfied: has_null_mask == true and - // null_policy == NULLIFY_OUTPUT_ROW. - return concatenate_with_nullifying_rows(input, stream, mr); -} - -} // namespace detail - -/** - * @copydoc cudf::lists::concatenate_rows - */ -std::unique_ptr concatenate_rows(table_view const& lists_columns, - concatenate_null_policy null_policy, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::concatenate_rows(lists_columns, null_policy, rmm::cuda_stream_default, mr); -} - -} // namespace lists -} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d87b4b81bdc..f36ec70479b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -407,7 +407,8 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) ################################################################################################### # - lists tests ---------------------------------------------------------------------------------- ConfigureTest(LISTS_TEST - lists/concatenate_rows_tests.cpp + lists/combine/concatenate_list_elements_tests.cpp + lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp lists/count_elements_tests.cpp lists/drop_list_duplicates_tests.cpp diff --git a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp new file mode 100644 index 00000000000..de6307471a9 --- /dev/null +++ b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace { +using StrListsCol = cudf::test::lists_column_wrapper; +using IntListsCol = cudf::test::lists_column_wrapper; +using IntCol = cudf::test::fixed_width_column_wrapper; + +constexpr bool print_all{false}; // For debugging +constexpr int32_t null{0}; + +template +auto build_lists_col(T& list, Ts&... lists) +{ + return T(std::initializer_list{std::move(list), std::move(lists)...}); +} + +auto all_nulls() { return cudf::test::iterator_all_nulls(); } + +auto null_at(cudf::size_type idx) { return cudf::test::iterator_with_null_at(idx); } + +auto null_at(std::vector const& indices) +{ + return cudf::test::iterator_with_null_at(cudf::host_span{indices}); +} + +} // namespace + +struct ConcatenateListElementsTest : public cudf::test::BaseFixture { +}; + +TEST_F(ConcatenateListElementsTest, InvalidInput) +{ + // Input lists is not a 2-level depth lists column. + { + auto const col = IntCol{}; + EXPECT_THROW(cudf::lists::concatenate_list_elements(col), cudf::logic_error); + } + + // Input lists is not at least 2-level depth lists column. + { + auto const col = IntListsCol{1, 2, 3}; + EXPECT_THROW(cudf::lists::concatenate_list_elements(col), cudf::logic_error); + } +} + +template +struct ConcatenateListElementsTypedTest : public cudf::test::BaseFixture { +}; + +using TypesForTest = cudf::test::Concat; +TYPED_TEST_CASE(ConcatenateListElementsTypedTest, TypesForTest); + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row0 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row1 = ListsCol{ListsCol{}}; + auto row2 = ListsCol{{7, 8}, {9, 10}}; + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{1, 2, 3, 4, 5, 6}, {}, {7, 8, 9, 10}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNestedManyLevelsNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row00 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row01 = ListsCol{ListsCol{}}; + auto row02 = ListsCol{{7, 8}, {9, 10}}; + auto row0 = build_lists_col(row00, row01, row02); + + auto row10 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row11 = ListsCol{ListsCol{}}; + auto row12 = ListsCol{{7, 8}, {9, 10}}; + auto row1 = build_lists_col(row10, row11, row12); + + auto row20 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row21 = ListsCol{ListsCol{}}; + auto row22 = ListsCol{{7, 8}, {9, 10}}; + auto row2 = build_lists_col(row20, row21, row22); + + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnNoNull) +{ + auto row0 = StrListsCol{StrListsCol{"Tomato", "Apple"}, StrListsCol{"Orange"}}; + auto row1 = StrListsCol{StrListsCol{"Banana", "Kiwi", "Cherry"}, StrListsCol{"Lemon", "Peach"}}; + auto row2 = StrListsCol{StrListsCol{"Coconut"}, StrListsCol{}}; + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{"Tomato", "Apple", "Orange"}, + StrListsCol{"Banana", "Kiwi", "Cherry", "Lemon", "Peach"}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + auto row0 = ListsCol{{ListsCol{{1, null, 3, 4}, null_at(1)}, + ListsCol{{10, 11, 12, null}, null_at(3)}, + ListsCol{} /*NULL*/}, + null_at(2)}; + auto row1 = ListsCol{ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{{13, 14, 15, 16, 17, null}, null_at(5)}, + ListsCol{{20, null}, null_at(1)}}; + auto row2 = ListsCol{{ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{} /*NULL*/, + ListsCol{{null, 21, null, null}, null_at({0, 2, 3})}}, + null_at(1)}; + auto row3 = ListsCol{{ListsCol{} /*NULL*/, ListsCol{{null, 18}, null_at(0)}}, null_at(0)}; + auto row4 = ListsCol{ListsCol{{1, 2, null, 4}, null_at(2)}, + ListsCol{{19, 20, null}, null_at(2)}, + ListsCol{22, 23, 24, 25}}; + auto row5 = ListsCol{ListsCol{{1, 2, 3, null}, null_at(3)}, + ListsCol{{null}, null_at(0)}, + ListsCol{{null, null, null, null, null}, all_nulls()}}; + auto const col = build_lists_col(row0, row1, row2, row3, row4, row5); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + ListsCol{{ListsCol{} /*NULL*/, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{} /*NULL*/, + ListsCol{} /*NULL*/, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}}, + null_at({0, 2, 3})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNestedManyLevelsWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row00 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row01 = ListsCol{ListsCol{}}; /*NULL*/ + auto row02 = ListsCol{{7, 8}, {9, 10}}; + auto row0 = ListsCol{{std::move(row00), std::move(row01), std::move(row02)}, null_at(1)}; + + auto row10 = ListsCol{{{1, 2}, {3}, {4, 5, 6} /*NULL*/}, null_at(2)}; + auto row11 = ListsCol{ListsCol{}}; + auto row12 = ListsCol{{7, 8}, {9, 10}}; + auto row1 = build_lists_col(row10, row11, row12); + + auto row20 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row21 = ListsCol{ListsCol{}}; + auto row22 = ListsCol{ListsCol{{null, 8}, null_at(0)}, {9, 10}}; + auto row2 = build_lists_col(row20, row21, row22); + + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{1, 2}, {3}, {4, 5, 6}, {7, 8}, {9, 10}}, + ListsCol{{{1, 2}, {3}, {} /*NULL*/, {}, {7, 8}, {9, 10}}, null_at(2)}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, ListsCol{{null, 8}, null_at(0)}, {9, 10}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + ListsCol{{ListsCol{ListsCol{}}, /*NULL*/ + ListsCol{{{1, 2}, {3}, {} /*NULL*/, {}, {7, 8}, {9, 10}}, null_at(2)}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, ListsCol{{null, 8}, null_at(0)}, {9, 10}}}, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnWithNulls) +{ + auto row0 = StrListsCol{ + StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Lemon", "Peach"}}; + auto row2 = StrListsCol{{StrListsCol{"Coconut"}, StrListsCol{} /*NULL*/}, null_at(1)}; + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{ + StrListsCol{{"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" + /*NULL*/}, + null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{ + {StrListsCol{ + {"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnWithEmptyStringsAndNulls) +{ + auto row0 = + StrListsCol{StrListsCol{"", "", ""}, + StrListsCol{{"Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, null_at({1, 2, 3})}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/}, null_at({1, 4})}, + StrListsCol{""}}; + auto row2 = StrListsCol{{StrListsCol{"Coconut"}, StrListsCol{} /*NULL*/}, null_at(1)}; + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{ + StrListsCol{{"", "", "", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, ""}, null_at({1, 4})}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{ + {StrListsCol{{"", "", "", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, ""}, null_at({1, 4})}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SlicedColumnsInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col_original = ListsCol{ListsCol{{1, 2, 3}, {2, 3}}, + ListsCol{{3, 4, 5, 6}, {5, 6}, {}, {7}}, + ListsCol{{7, 7, 7}, {7, 8, 1, 0}, {1}}, + ListsCol{{9, 10, 11}}, + ListsCol{}, + ListsCol{{12, 13, 14, 15}, {16}, {17}}}; + + { + auto const col = cudf::slice(col_original, {0, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{{1, 2, 3, 2, 3}, {3, 4, 5, 6, 5, 6, 7}, {7, 7, 7, 7, 8, 1, 0, 1}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{3, 4, 5, 6, 5, 6, 7}, {7, 7, 7, 7, 8, 1, 0, 1}, {9, 10, 11}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{7, 7, 7, 7, 8, 1, 0, 1}, {9, 10, 11}, {}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {3, 6})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{9, 10, 11}, {}, {12, 13, 14, 15, 16, 17}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SlicedColumnsInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row0 = ListsCol{ListsCol{{null, 2, 3}, null_at(0)}, ListsCol{2, 3}}; + auto row1 = ListsCol{ListsCol{{3, null, null, 6}, null_at({1, 2})}, + ListsCol{{5, 6, null}, null_at(2)}, + ListsCol{}, + ListsCol{{7, null}, null_at(1)}}; + auto row2 = ListsCol{ListsCol{7, 7, 7}, ListsCol{{7, 8, null, 0}, null_at(2)}, ListsCol{1}}; + auto row3 = ListsCol{ListsCol{9, 10, 11}}; + auto row4 = ListsCol{ListsCol{}}; + auto row5 = ListsCol{ListsCol{{12, null, 14, 15}, null_at(1)}, ListsCol{16}, ListsCol{17}}; + auto const col_original = build_lists_col(row0, row1, row2, row3, row4, row5); + + { + auto const col = cudf::slice(col_original, {0, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{null, 2, 3, 2, 3}, null_at(0)}, + ListsCol{{3, null, null, 6, 5, 6, null, 7, null}, null_at({1, 2, 6, 8})}, + ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{3, null, null, 6, 5, 6, null, 7, null}, null_at({1, 2, 6, 8})}, + ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}, + ListsCol{9, 10, 11}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}, ListsCol{9, 10, 11}, ListsCol{}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {3, 6})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{9, 10, 11}, ListsCol{}, ListsCol{{12, null, 14, 15, 16, 17}, null_at(1)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TEST_F(ConcatenateListElementsTest, SlicedStringsColumnsInputWithNulls) +{ + auto row0 = StrListsCol{ + StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Coconut"}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Coconut"}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}}; + auto row2 = StrListsCol{ + StrListsCol{"Coconut"}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}}; + auto row3 = StrListsCol{ + {StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + auto const col_original = build_lists_col(row0, row1, row2, row3); + + { + auto const col = cudf::slice(col_original, {0, 2})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Tomato", + "" /*NULL*/, + "Apple", + "Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut"}, + null_at({1, 4, 7})}, + StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}, + null_at({1, 4, 7, 8, 9})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}, + null_at({1, 4, 7, 8, 9})}, + StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}, + StrListsCol{{"Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({1, 2, 3})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{{StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}, + StrListsCol{} /*NULL*/}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} diff --git a/cpp/tests/lists/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp similarity index 99% rename from cpp/tests/lists/concatenate_rows_tests.cpp rename to cpp/tests/lists/combine/concatenate_rows_tests.cpp index 5abaf99f739..3e085af7740 100644 --- a/cpp/tests/lists/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include namespace { using StrListsCol = cudf::test::lists_column_wrapper; diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 85bbdd41b4a..a09de5c61e3 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd b/python/cudf/cudf/_lib/cpp/lists/combine.pxd similarity index 83% rename from python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd rename to python/cudf/cudf/_lib/cpp/lists/combine.pxd index 8c4dabf5168..ea9ade178e2 100644 --- a/python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/combine.pxd @@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table_view cimport table_view -cdef extern from "cudf/lists/concatenate_rows.hpp" namespace \ +cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: cdef unique_ptr[column] concatenate_rows( const table_view input_table diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 46f034dc525..7d8909610dc 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -16,7 +16,7 @@ from cudf._lib.cpp.lists.drop_list_duplicates cimport ( from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) -from cudf._lib.cpp.lists.concatenate_rows cimport ( +from cudf._lib.cpp.lists.combine cimport ( concatenate_rows as cpp_concatenate_rows ) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view