diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index e709824721c..d4352937635 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -135,6 +135,7 @@ test: - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp + - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/contains.hpp - test -f $PREFIX/include/cudf/lists/gather.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2a51ad5e55a..cb364518922 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -256,6 +256,7 @@ add_library(cudf src/lists/copying/gather.cu src/lists/copying/segmented_gather.cu src/lists/count_elements.cu + src/lists/explode.cu src/lists/extract.cu src/lists/drop_list_duplicates.cu src/lists/lists_column_factories.cu @@ -285,7 +286,6 @@ add_library(cudf src/replace/nulls.cu src/replace/replace.cu src/reshape/byte_cast.cu - src/reshape/explode.cu src/reshape/interleave_columns.cu src/reshape/tile.cu src/rolling/grouped_rolling.cu diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp new file mode 100644 index 00000000000..156d4b9275d --- /dev/null +++ b/cpp/include/cudf/lists/explode.hpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cudf { + +/** + * @brief Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + * ``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * ``` + * Note that rows holding a null list or an empty list are not included in the resulting table, + * but nulls inside a list are represented with a null entry for that column in that row. + * + * @throws cudf::logic_error if the column at `explode_column_idx` is not a list column. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned table's device memory. + * + * @return A new table with the column at `explode_column_idx` exploded. + */ +std::unique_ptr explode( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Explodes a list column's elements and includes a position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. 
Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + * ``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * ``` + * Note that rows holding a null list or an empty list are not included in the resulting table, + * but nulls inside a list are represented with a null entry for that column in that row. + * + * @throws cudf::logic_error if the column at `explode_column_idx` is not a list column. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned table's device memory. + * + * @return A new table with exploded value and position. The column order of return table is + * [cols before explode_input, explode_position, explode_value, cols after explode_input]. + */ +std::unique_ptr
explode_position( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Explodes a list column's elements retaining any null entries or empty lists inside. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * Nulls and empty lists propagate as null entries in the result. + * ``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * [null, 200], + * [null, 300], + * ``` + * + * @throws cudf::logic_error if the column at `explode_column_idx` is not a list column. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned table's device memory. + * + * @return A new table with the column at `explode_column_idx` exploded. + */ +std::unique_ptr
explode_outer( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Explodes a list column's elements retaining any null entries or empty lists and includes a + * position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300], + * ``` + * + * Nulls and empty lists propagate as null entries in the result, each with position 0. + * ``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * [0, null, 200], + * [0, null, 300], + * ``` + * + * @throws cudf::logic_error if the column at `explode_column_idx` is not a list column. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned table's device memory. + * + * @return A new table with exploded value and position. The column order of return table is + * [cols before explode_input, explode_position, explode_value, cols after explode_input]. + */ +std::unique_ptr
explode_outer_position( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group + +} // namespace cudf diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index a6030f31e6d..74e4ebb8d05 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -97,92 +97,6 @@ std::unique_ptr byte_cast( flip_endianness endian_configuration, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Explodes a list column's elements. - * - * Any list is exploded, which means the elements of the list in each row are expanded into new rows - * in the output. The corresponding rows for other columns in the input are duplicated. Example: - * ``` - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [5, 100], - * [10, 100], - * [15, 100], - * [20, 200], - * [25, 200], - * [30, 300], - * ``` - * - * Nulls and empty lists propagate in different ways depending on what is null or empty. - *``` - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [5, 100], - * [null, 100], - * [15, 100], - * ``` - * Note that null lists are not included in the resulting table, but nulls inside - * lists and empty lists will be represented with a null entry for that column in that row. - * - * @param input_table Table to explode. - * @param explode_column_idx Column index to explode inside the table. - * @param mr Device memory resource used to allocate the returned column's device memory. - * - * @return A new table with explode_col exploded. - */ -std::unique_ptr
explode( - table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Explodes a list column's elements and includes a position column. - * - * Any list is exploded, which means the elements of the list in each row are expanded into new rows - * in the output. The corresponding rows for other columns in the input are duplicated. A position - * column is added that has the index inside the original list for each row. Example: - * ``` - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [0, 5, 100], - * [1, 10, 100], - * [2, 15, 100], - * [0, 20, 200], - * [1, 25, 200], - * [0, 30, 300], - * ``` - * - * Nulls and empty lists propagate in different ways depending on what is null or empty. - *``` - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [0, 5, 100], - * [1, null, 100], - * [2, 15, 100], - * ``` - * Note that null lists are not included in the resulting table, but nulls inside - * lists and empty lists will be represented with a null entry for that column in that row. - * - * @param input_table Table to explode. - * @param explode_column_idx Column index to explode inside the table. - * @param mr Device memory resource used to allocate the returned column's device memory. - * - * @return A new table with exploded value and position. The column order of return table is - * [cols before explode_input, explode_position, explode_value, cols after explode_input]. - */ -std::unique_ptr
explode_position( - table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 553cf5e9096..4571362076c 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -110,6 +110,27 @@ class table { */ std::vector> release(); + /** + * @brief Returns a table_view built from a range of column indices. + * + * @throws std::out_of_range + * If any index is outside [0, num_columns()) + * + * @param begin Iterator to the first column index (the range is traversed twice, so it must be multi-pass) + * @param end Iterator one past the last column index + * @return A table_view consisting of columns from the original table + * specified by the indices in the range [`begin`, `end`) + */ + + template + table_view select(InputIterator begin, InputIterator end) const + { + std::vector columns(std::distance(begin, end)); + std::transform( + begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); + return table_view(columns); + } + /** * @brief Returns a table_view with set of specified columns. * @@ -120,7 +141,10 @@ class table { * @return A table_view consisting of columns from the original table * specified by the elements of `column_indices` */ - table_view select(std::vector const& column_indices) const; + table_view select(std::vector const& column_indices) const + { + return select(column_indices.begin(), column_indices.end()); + }; /** * @brief Returns a reference to the specified column diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 22f2073f73c..083366cc310 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -174,6 +174,25 @@ class table_view : public detail::table_view_base { */ table_view(std::vector const& views); + /** + * @brief Returns a table_view built from a range of column indices. 
+ * + * @throws std::out_of_range + * If any index is outside [0, num_columns()) + * + * @param begin Iterator to the first column index (the range is traversed twice, so it must be multi-pass) + * @param end Iterator one past the last column index + * @return A table_view consisting of columns from the original table + * specified by the indices in the range [`begin`, `end`) + */ + template + table_view select(InputIterator begin, InputIterator end) const + { + std::vector columns(std::distance(begin, end)); + std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); + return table_view(columns); + } + /** * @brief Returns a table_view with set of specified columns. * diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu new file mode 100644 index 00000000000..336aabde15e --- /dev/null +++ b/cpp/src/lists/explode.cu @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace { + +std::unique_ptr
build_table( + table_view const& input_table, + size_type const explode_column_idx, + column_view const& sliced_child, + cudf::device_span gather_map, + thrust::optional> explode_col_gather_map, + thrust::optional> position_array, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Assembles the output table: every column except the exploded one is gathered through + // `gather_map`, then the exploded values (and optionally the positions) are spliced back in at + // `explode_column_idx`. + + // Maps 0..num_columns-2 onto the input column indices, skipping `explode_column_idx`. + auto select_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; }); + + auto gathered_table = + detail::gather(input_table.select(select_iter, select_iter + input_table.num_columns() - 1), + gather_map.begin(), + gather_map.end(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + // Take the columns with table::release() through the owning pointer. Calling + // unique_ptr::release() first would give up ownership of the table object and leak it. + std::vector> columns = gathered_table->release(); + + // With a gather map the child is gathered under the NULLIFY policy; without one the sliced + // child is copied as-is. + columns.insert(columns.begin() + explode_column_idx, + explode_col_gather_map + ? 
std::move(detail::gather(table_view({sliced_child}), + explode_col_gather_map->begin(), + explode_col_gather_map->end(), + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release()[0]) + : std::make_unique(sliced_child, stream, mr)); + + if (position_array) { + // Inserted at the same index, so the position column ends up directly before the values. + size_type position_size = position_array->size(); + columns.insert(columns.begin() + explode_column_idx, + std::make_unique( + data_type(type_to_id()), position_size, position_array->release())); + } + + return std::make_unique
(std::move(columns)); +} +} // namespace + +std::unique_ptr
explode(table_view const& input_table, + size_type const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + /* Builds a row gather map with one entry per child element, then defers to build_table. */ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + rmm::device_uvector gather_map(sliced_child.size(), stream); + + // Sliced columns may require rebasing of the offsets: each offset is taken relative to + // offsets[0] before it is compared against an index into `sliced_child`. + auto offsets = explode_col.offsets_begin(); + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each + // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by + // skipping the first entry and using the result of `lower_bound` directly. + // gather_map[i] receives the index of the input row whose list owns child element i. + thrust::lower_bound(rmm::exec_policy(stream), + offsets_minus_one, + offsets_minus_one + explode_col.size(), + counting_iter, + counting_iter + gather_map.size(), + gather_map.begin()); + + return build_table(input_table, + explode_column_idx, + sliced_child, + gather_map, + thrust::nullopt, + thrust::nullopt, + stream, + mr); +} + +std::unique_ptr
explode_position(table_view const& input_table, + size_type const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + /* Same row gather map as detail::explode, plus each element's index within its own list. */ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + rmm::device_uvector gather_map(sliced_child.size(), stream); + + // Sliced columns may require rebasing of the offsets: each offset is taken relative to + // offsets[0] before it is compared against an index into `sliced_child`. + auto offsets = explode_col.offsets_begin(); + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + // `pos` is allocated from `mr` because its buffer is handed to the returned position column. + rmm::device_uvector pos(sliced_child.size(), stream, mr); + + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each + // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by + // skipping the first entry and using the result of `lower_bound` directly. + // The same per-element search also yields the position: the element's index minus the rebased + // start offset of the list that owns it. 
+ thrust::transform( + rmm::exec_policy(stream), + counting_iter, + counting_iter + gather_map.size(), + gather_map.begin(), + [position_array = pos.data(), + offsets_minus_one, + offsets, + offset_size = explode_col.size()] __device__(auto idx) -> size_type { + auto lb_idx = thrust::distance( + offsets_minus_one, + thrust::lower_bound(thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx)); + position_array[idx] = idx - (offsets[lb_idx] - offsets[0]); + return lb_idx; + }); + + return build_table(input_table, + explode_column_idx, + sliced_child, + gather_map, + thrust::nullopt, + std::move(pos), + stream, + mr); +} + +std::unique_ptr
explode_outer(table_view const& input_table, + size_type const explode_column_idx, + bool include_position, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + auto counting_iter = thrust::make_counting_iterator(0); + auto offsets = explode_col.offsets_begin(); + + // number of nulls or empty lists found so far in the explode column + rmm::device_uvector null_or_empty_offset(explode_col.size(), stream); + + auto null_or_empty = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [offsets, offsets_size = explode_col.size() - 1] __device__(int idx) { + return (idx > offsets_size || (offsets[idx + 1] != offsets[idx])) ? 0 : 1; + }); + thrust::inclusive_scan(rmm::exec_policy(stream), + null_or_empty, + null_or_empty + sliced_child.size(), + null_or_empty_offset.begin()); + + auto null_or_empty_count = + null_or_empty_offset.size() > 0 ? null_or_empty_offset.back_element(stream) : 0; + if (null_or_empty_count == 0) { + // performance penalty to run the below loop if there are no nulls or empty lists. + // run simple explode instead + return include_position ? explode_position(input_table, explode_column_idx, stream, mr) + : explode(input_table, explode_column_idx, stream, mr); + } + + auto gather_map_size = sliced_child.size() + null_or_empty_count; + + rmm::device_uvector gather_map(gather_map_size, stream); + rmm::device_uvector explode_col_gather_map(gather_map_size, stream); + rmm::device_uvector pos(include_position ? gather_map_size : 0, stream, mr); + + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. 
+ auto offsets_minus_one = thrust::make_transform_iterator( + thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + // Fill in gather map with all the child column's entries + thrust::for_each(rmm::exec_policy(stream), + counting_iter, + counting_iter + sliced_child.size(), + [offsets_minus_one, + gather_map = gather_map.begin(), + explode_col_gather_map = explode_col_gather_map.begin(), + position_array = pos.begin(), + include_position, + offsets, + null_or_empty_offset = null_or_empty_offset.begin(), + null_or_empty, + offset_size = explode_col.offsets().size() - 1] __device__(auto idx) { + auto lb_idx = thrust::distance( + offsets_minus_one, + thrust::lower_bound( + thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx)); + auto index_to_write = null_or_empty_offset[lb_idx] + idx; + gather_map[index_to_write] = lb_idx; + explode_col_gather_map[index_to_write] = idx; + if (include_position) { + position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]); + } + if (null_or_empty[idx]) { + auto invalid_index = null_or_empty_offset[idx] == 0 + ? offsets[idx] + : offsets[idx] + null_or_empty_offset[idx] - 1; + gather_map[invalid_index] = idx; + + // negative one to indicate a null value + explode_col_gather_map[invalid_index] = -1; + + if (include_position) { position_array[invalid_index] = 0; } + } + }); + + return build_table( + input_table, + explode_column_idx, + sliced_child, + gather_map, + explode_col_gather_map, + include_position ? std::move(pos) : thrust::optional>{}, + stream, + mr); +} + +} // namespace detail + +/** + * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) + */ +std::unique_ptr
explode(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + /* Reject non-list columns up front, before handing off to the detail implementation. */ + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::explode_position(table_view const&, size_type, rmm::mr::device_memory_resource*) + */ +std::unique_ptr
explode_position(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + /* Reject non-list columns up front, before handing off to the detail implementation. */ + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode_position(input_table, explode_column_idx, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::mr::device_memory_resource*) + */ +std::unique_ptr
explode_outer(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + /* Reject non-list columns up front, before handing off to the detail implementation. */ + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode_outer( + input_table, explode_column_idx, false /* include_position */, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc + * cudf::explode_outer_position(table_view const&, size_type, rmm::mr::device_memory_resource*) + */ +std::unique_ptr
explode_outer_position(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + /* Reject non-list columns up front, before handing off to the detail implementation. */ + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode_outer(input_table, explode_column_idx, true /* include_position */, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu deleted file mode 100644 index 34d7d8fe31d..00000000000 --- a/cpp/src/reshape/explode.cu +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -namespace cudf { -namespace detail { -namespace { -/** - * @brief Function object for exploding a column. - */ -struct explode_functor { - /** - * @brief Function object for exploding a column. - */ - template - std::unique_ptr
operator()(table_view const& input_table, - size_type const explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - CUDF_FAIL("Unsupported non-list column"); - - return std::make_unique
(); - } -}; - -template <> -std::unique_ptr
explode_functor::operator()( - table_view const& input_table, - size_type const explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - lists_column_view lc{input_table.column(explode_column_idx)}; - auto sliced_child = lc.get_sliced_child(stream); - rmm::device_uvector gather_map_indices(sliced_child.size(), stream); - - // Sliced columns may require rebasing of the offsets. - auto offsets = lc.offsets_begin(); - // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. - auto offsets_minus_one = thrust::make_transform_iterator( - offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); - auto counting_iter = thrust::make_counting_iterator(0); - - rmm::device_uvector pos(include_pos ? sliced_child.size() : 0, stream, mr); - - // This looks like an off-by-one bug, but what is going on here is that we need to reduce each - // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by - // skipping the first entry and using the result of `lower_bound` directly. - if (include_pos) { - thrust::transform( - rmm::exec_policy(stream), - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin(), - [position_array = pos.data(), offsets_minus_one, offsets, offset_size = lc.size()] __device__( - auto idx) -> size_type { - auto lb_idx = thrust::lower_bound( - thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx) - - offsets_minus_one; - position_array[idx] = idx - (offsets[lb_idx] - offsets[0]); - return lb_idx; - }); - } else { - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + lc.size(), - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin()); - } - - auto select_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [explode_column_idx](size_type i) { return i >= explode_column_idx ? 
i + 1 : i; }); - std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); - - auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), - gather_map_indices.begin(), - gather_map_indices.end(), - cudf::out_of_bounds_policy::DONT_CHECK, - stream, - mr); - - std::vector> columns = gathered_table.release()->release(); - - columns.insert(columns.begin() + explode_column_idx, - std::make_unique(sliced_child, stream, mr)); - - if (include_pos) { - columns.insert(columns.begin() + explode_column_idx, - std::make_unique( - data_type(type_to_id()), sliced_child.size(), pos.release())); - } - - return std::make_unique
(std::move(columns)); -} -} // namespace - -/** - * @copydoc - * cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr
explode(table_view const& input_table, - size_type explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return type_dispatcher(input_table.column(explode_column_idx).type(), - explode_functor{}, - input_table, - explode_column_idx, - include_pos, - stream, - mr); -} - -} // namespace detail - -/** - * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) - */ -std::unique_ptr
explode(table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::explode(input_table, explode_column_idx, false, rmm::cuda_stream_default, mr); -} - -/** - * @copydoc cudf::explode_position(input_table,explode_column_idx,rmm::mr::device_memory_resource) - */ -std::unique_ptr
explode_position(table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::explode(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr); -} - -} // namespace cudf diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index afda6313254..4cd85fc5e7e 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -81,12 +81,4 @@ std::vector> table::release() return std::move(_columns); } -// Returns a table_view with set of specified columns -table_view table::select(std::vector const& column_indices) const -{ - std::vector columns; - for (auto index : column_indices) { columns.push_back(_columns.at(index)->view()); } - return table_view(columns); -} - } // namespace cudf diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 9c421f6fd36..c64bf5b2823 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -63,11 +63,7 @@ template class table_view_base; // Returns a table_view with set of specified columns table_view table_view::select(std::vector const& column_indices) const { - std::vector columns(column_indices.size()); - std::transform(column_indices.begin(), column_indices.end(), columns.begin(), [this](auto index) { - return this->column(index); - }); - return table_view(columns); + return select(column_indices.begin(), column_indices.end()); } // Convert mutable view to immutable view diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 40829c74957..83e9c60f15b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -319,7 +319,6 @@ ConfigureTest(SEARCH_TEST search/search_test.cpp) # - reshape test ---------------------------------------------------------------------------------- ConfigureTest(RESHAPE_TEST reshape/byte_cast_tests.cpp - reshape/explode_tests.cpp reshape/interleave_columns_tests.cpp reshape/tile_tests.cpp) @@ -423,6 +422,7 @@ ConfigureTest(LISTS_TEST 
lists/contains_tests.cpp lists/count_elements_tests.cpp lists/drop_list_duplicates_tests.cpp + lists/explode_tests.cpp lists/extract_tests.cpp lists/sort_lists_tests.cpp) diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp new file mode 100644 index 00000000000..2ec9294d118 --- /dev/null +++ b/cpp/tests/lists/explode_tests.cpp @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +using namespace cudf::test; +using FCW = fixed_width_column_wrapper; +using LCW = lists_column_wrapper; + +class ExplodeTest : public cudf::test::BaseFixture { +}; + +class ExplodeOuterTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeTypedTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeOuterTypedTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); + +TYPED_TEST_CASE(ExplodeOuterTypedTest, cudf::test::FixedPointTypes); + +TEST_F(ExplodeTest, Empty) +{ + cudf::table_view t({LCW{}, FCW{}}); + + auto ret = cudf::explode(t, 0); + + cudf::table_view expected({FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + auto pos_ret = cudf::explode_position(t, 0); + + cudf::table_view pos_expected({FCW{}, FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, 
NonList) +{ + cudf::table_view t({FCW{100, 200, 300}, FCW{100, 200, 300}}); + + EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); + EXPECT_THROW(cudf::explode_position(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeTest, Basics) +{ + // a b c + // 100 [1, 2, 7] string0 + // 200 [5, 6] string1 + // 300 [0, 3] string2 + + FCW a{100, 200, 300}; + LCW b{LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + FCW expected_a{100, 100, 100, 200, 200, 300, 300}; + FCW expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode(t, 1); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); + + auto pos_ret = cudf::explode_position(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, SingleNull) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + auto first_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? 
false : true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + FCW b({100, 200, 300, 400}); + + FCW expected_a{5, 6, 0, 3}; + FCW expected_b{200, 200, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, Nulls) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [0, 3] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto always_valid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + FCW b({100, 200, 300}, valids); + + FCW expected_a({1, 2, 7, 0, 3}); + FCW expected_b({100, 100, 100, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInList) +{ + // a b + // [1, 2, 7] 100 + // [5, 6, 0, 9] 200 + // [] 300 + // [0, 3, 8] 400 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + FCW b{100, 200, 300, 400}; + + FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + FCW expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, Nested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[],[5],[2, 1]] 300 + + LCW a{LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}}; + FCW b{100, 200, 300}; + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}; + FCW expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 3}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NestedNulls) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + auto always_valid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, 200, 300}, valids); + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{0, 3}, LCW{5}, LCW{2, 1}}; + FCW expected_b({100, 100, 300, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInNested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b({100, 200, 300}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInNestedDoubleExplode) +{ + // a b + // [[1, 2], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + FCW b{100, 200, 300}; + + FCW expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_b{100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto first_explode_ret = cudf::explode(t, 0); + auto ret = cudf::explode(first_explode_ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(first_explode_ret->view(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NestedStructs) +{ + // a b + // [[1, 2], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, 1]] 
{300, "300"} + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TYPED_TEST(ExplodeTypedTest, ListOfStructs) +{ + // a b + // [{70, "70"}, {75, "75"}] 100 + // [{50, "50"}, {55, "55"}] 200 + // [{35, "35"}, {45, "45"}] 300 + // [{25, "25"}, {30, "30"}] 400 + // [{15, "15"}, {20, "20"}] 500 + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column( + 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + + FCW b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", 
"55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + FCW expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, SlicedList) +{ + // a b + // [[1, 2],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + // [[8, 3],[],[4, 3, 1, 2]] 400 + // [[2, 3, 4],[9, 8]] 500 + + // slicing the top 2 rows and the bottom row off + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{2, 3, 4}, LCW{9, 8}}}); + FCW b({100, 200, 300, 400, 500}); + + LCW expected_a{ + LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + FCW expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(sliced_t[0], 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Empty) +{ + LCW a{}; + FCW b{}; + + cudf::table_view t({LCW{}, FCW{}}); + + auto ret = cudf::explode_outer(t, 0); + + FCW expected_a{}; + FCW 
expected_b{}; + cudf::table_view expected({FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeOuterTest, NonList) +{ + cudf::table_view t({FCW{100, 200, 300}, FCW{100, 200, 300}}); + + EXPECT_THROW(cudf::explode_outer(t, 1), cudf::logic_error); + EXPECT_THROW(cudf::explode_outer_position(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeOuterTest, Basics) +{ + // a b c + // 100 [1, 2, 7] string0 + // 200 [5, 6] string1 + // 300 [0, 3] string2 + + FCW a{100, 200, 300}; + LCW b{LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + FCW expected_a{100, 100, 100, 200, 200, 300, 300}; + FCW expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode_outer(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); + + auto pos_ret = cudf::explode_outer_position(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SingleNull) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + auto first_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? 
false : true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + FCW b({100, 200, 300, 400}); + + FCW expected_a{{0, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}}; + FCW expected_b{100, 200, 200, 300, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Nulls) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [0, 3] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + FCW b({100, 200, 300}, valids); + + FCW expected_a({1, 2, 7, 0, 0, 3}, {1, 1, 1, 0, 1, 1}); + FCW expected_b({100, 100, 100, 200, 300, 300}, {1, 1, 1, 0, 1, 1}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInList) +{ + // a b + // [1, 2, 7] 100 + // [5, 6, 0, 9] 200 + // [] 300 + // [0, 3, 8] 400 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + FCW b{100, 200, 300, 400}; + + FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}); + FCW expected_b{100, 100, 100, 200, 200, 200, 200, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Nested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[],[5],[2, 1]] 300 + + LCW a{LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}}; + FCW b{100, 200, 300}; + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}; + FCW expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 3}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NestedNulls) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, 200, 300}); + + auto expected_valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? false : true; }); + LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, expected_valids); + FCW expected_b({100, 100, 200, 300, 300, 300}); + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInNested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b({100, 200, 300}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) +{ + // a b + // [[1, 2], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + FCW b{100, 200, 300}; + + FCW expected_a({1, 2, 0, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_b{100, 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto first_explode_ret = cudf::explode_outer(t, 0); + auto ret = cudf::explode_outer(first_explode_ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(first_explode_ret->view(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NestedStructs) +{ + // a b + // [[1, 2], [7, 6, 5]] {100, "100"} + 
// [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, 1]] {300, "300"} + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs) +{ + // a b + // [{70, "70"}, {75, "75"}] 100 + // [{50, "50"}, {55, "55"}] 200 + // [{35, "35"}, {45, "45"}] 300 + // [{25, "25"}, {30, "30"}] 400 + // [{15, "15"}, {20, "20"}] 500 + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column( + 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + + FCW b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode_outer(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 
15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + FCW expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SlicedList) +{ + // a b + // [[1, 2],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + // [[8, 3],[],[4, 3, 1, 2]] 400 + // [[2, 3, 4],[9, 8]] 500 + + // slicing the top 2 rows and the bottom row off + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{2, 3, 4}, LCW{9, 8}}}); + FCW b({100, 200, 300, 400, 500}); + + LCW expected_a{ + LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + FCW expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(sliced_t[0], 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp deleted file mode 100644 index 5f3237ce46d..00000000000 --- a/cpp/tests/reshape/explode_tests.cpp +++ /dev/null @@ -1,530 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include - -using namespace cudf::test; - -class ExplodeTest : public cudf::test::BaseFixture { -}; - -template -class ExplodeTypedTest : public cudf::test::BaseFixture { -}; - -TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); - -TEST_F(ExplodeTest, Empty) -{ - lists_column_wrapper a{}; - fixed_width_column_wrapper b{}; - - cudf::table_view t({a, b}); - - auto ret = cudf::explode(t, 0); - - fixed_width_column_wrapper expected_a{}; - fixed_width_column_wrapper expected_b{}; - cudf::table_view expected({expected_a, expected_b}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - auto pos_ret = cudf::explode_position(t, 0); - - fixed_width_column_wrapper expected_c{}; - cudf::table_view pos_expected({expected_a, expected_b, expected_c}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NonList) -{ - fixed_width_column_wrapper a{100, 200, 300}; - fixed_width_column_wrapper b{100, 200, 300}; - - cudf::table_view t({a, b}); - - EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); - EXPECT_THROW(cudf::explode_position(t, 1), cudf::logic_error); -} - -TEST_F(ExplodeTest, Basics) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [0, 3] 300 - */ - - fixed_width_column_wrapper a{100, 200, 300}; - lists_column_wrapper b{lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}}; - strings_column_wrapper c{"string0", "string1", "string2"}; - - fixed_width_column_wrapper expected_a{100, 100, 100, 200, 200, 300, 300}; - fixed_width_column_wrapper expected_b{1, 2, 7, 5, 6, 0, 3}; - strings_column_wrapper expected_c{ - "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; - - cudf::table_view t({a, b, c}); - cudf::table_view expected({expected_a, expected_b, expected_c}); - - auto ret = cudf::explode(t, 1); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper 
expected_pos_col{0, 1, 2, 0, 1, 0, 1}; - cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); - - auto pos_ret = cudf::explode_position(t, 1); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, SingleNull) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [] 300 - [0, 3] 400 - */ - - auto first_invalid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); - - lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{}, - lists_column_wrapper{0, 3}}, - first_invalid); - fixed_width_column_wrapper b({100, 200, 300, 400}); - - fixed_width_column_wrapper expected_a{5, 6, 0, 3}; - fixed_width_column_wrapper expected_b{200, 200, 400, 400}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, Nulls) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [0, 3] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - auto always_valid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}}, - valids); - fixed_width_column_wrapper b({100, 200, 300}, valids); - - fixed_width_column_wrapper expected_a({1, 2, 7, 0, 3}); - fixed_width_column_wrapper expected_b({100, 100, 100, 300, 300}, always_valid); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInList) -{ - /* - a b - [1, 2, 7] 100 - [5, 6, 0, 9] 200 - [] 300 - [0, 3, 8] 400 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), - lists_column_wrapper({5, 6, 0, 9}, valids), - lists_column_wrapper{}, - lists_column_wrapper({0, 3, 8}, valids)}; - fixed_width_column_wrapper b{100, 200, 300, 400}; - - fixed_width_column_wrapper expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, - {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); - fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, Nested) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[],[5],[2, 1]] 300 - */ - - lists_column_wrapper a{ - lists_column_wrapper{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}}; - fixed_width_column_wrapper b{100, 200, 300}; - - lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}; - fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2, 3}; - cudf::table_view pos_expected({expected_pos_col, 
expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NestedNulls) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - auto always_valid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}}, - valids); - fixed_width_column_wrapper b({100, 200, 300}, valids); - - lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}; - fixed_width_column_wrapper expected_b({100, 100, 300, 300, 300}, always_valid); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInNested) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}); - fixed_width_column_wrapper b({100, 200, 300}); - - lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}; - fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInNestedDoubleExplode) -{ - /* - a b - [[1, 2], [], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a{ - lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}; - fixed_width_column_wrapper b{100, 200, 300}; - - fixed_width_column_wrapper expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - fixed_width_column_wrapper expected_b{ - 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto first_explode_ret = cudf::explode(t, 0); - auto ret = cudf::explode(first_explode_ret->view(), 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(first_explode_ret->view(), 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NestedStructs) -{ - /* - a b - [[1, 2], [7, 6, 5]] {100, "100"} - [[5, 6]] {200, "200"} - [[0, 3],[5],[2, 1]] {300, "300"} - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}); - fixed_width_column_wrapper b1({100, 200, 300}); - strings_column_wrapper b2{"100", "200", "300"}; - structs_column_wrapper b({b1, b2}); - - lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}; - fixed_width_column_wrapper expected_b1{100, 100, 200, 300, 300, 300}; - strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; - structs_column_wrapper expected_b({expected_b1, expected_b2}); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TYPED_TEST(ExplodeTypedTest, ListOfStructs) -{ - /* - a b - [{70, "70"}, {75, "75"}] 100 - [{50, "50"}, {55, "55"}] 200 - [{35, "35"}, {45, "45"}] 300 - [{25, "25"}, {30, "30"}] 400 - [{15, "15"}, {20, "20"}] 500 -*/ - - auto numeric_col = - fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; - strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; - auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); - auto a = cudf::make_lists_column(5, - fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), - std::move(struct_col), - cudf::UNKNOWN_NULL_COUNT, - {}); - - 
fixed_width_column_wrapper b{100, 200, 300, 400, 500}; - - cudf::table_view t({a->view(), b}); - auto ret = cudf::explode(t, 0); - - auto expected_numeric_col = - fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; - strings_column_wrapper expected_string_col{ - "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; - - auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); - fixed_width_column_wrapper expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; - - cudf::table_view expected({expected_a->view(), expected_b}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, SlicedList) -{ - /* - a b - [[1, 2],[7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - [[8, 3],[],[4, 3, 1, 2]] 400 - [[2, 3, 4],[9, 8]] 500 - - slicing the top 2 rows and the bottom row off - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}, - lists_column_wrapper{lists_column_wrapper{8, 3}, - lists_column_wrapper{}, - lists_column_wrapper({4, 3, 1, 2}, valids)}, - lists_column_wrapper{lists_column_wrapper{2, 3, 4}, - lists_column_wrapper{9, 8}}}); - fixed_width_column_wrapper b({100, 200, 300, 400, 500}); - - lists_column_wrapper expected_a{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids), - lists_column_wrapper{8, 3}, - lists_column_wrapper{}, - lists_column_wrapper({4, 3, 1, 2}, valids)}; - fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400, 400}; - - cudf::table_view t({a, b}); - auto sliced_t = cudf::slice(t, {2, 4}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(sliced_t[0], 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(sliced_t[0], 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} diff --git a/python/cudf/cudf/_lib/cpp/lists/explode.pxd b/python/cudf/cudf/_lib/cpp/lists/explode.pxd new file mode 100644 index 00000000000..cd2d44d2e42 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/explode.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type + +cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil: + cdef unique_ptr[table] explode_outer( + const table_view, + size_type explode_column_idx, + ) except + diff --git a/python/cudf/cudf/_lib/cpp/reshape.pxd b/python/cudf/cudf/_lib/cpp/reshape.pxd index 2985b9282b3..3486fd6193d 100644 --- a/python/cudf/cudf/_lib/cpp/reshape.pxd +++ b/python/cudf/cudf/_lib/cpp/reshape.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr @@ -14,3 +14,6 @@ cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil: cdef unique_ptr[table] tile( table_view source_table, size_type count ) except + + cdef unique_ptr[table] explode( + table_view input_table, size_type explode_column_idx, + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index aba13580912..56b89e9244c 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -6,12 +6,19 @@ from libcpp.utility cimport move from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements ) +from cudf._lib.cpp.lists.explode cimport ( + explode_outer as cpp_explode_outer +) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column -from cudf._lib.column cimport Column +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type +from cudf._lib.column cimport Column +from cudf._lib.table cimport Table from cudf.core.dtypes import ListDtype @@ -32,3 +39,19 @@ def count_elements(Column col): result = Column.from_unique_ptr(move(c_result)) return result + + +def 
explode_outer(Table tbl, int explode_column_idx): + cdef table_view c_table_view = tbl.view() + cdef size_type c_explode_column_idx = explode_column_idx + + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) + + return Table.from_unique_ptr( + move(c_result), + column_names=tbl._column_names, + index_names=tbl._index_names + ) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index cebe48eb697..a89bee4e14f 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -12,7 +12,8 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.reshape cimport ( interleave_columns as cpp_interleave_columns, - tile as cpp_tile + tile as cpp_tile, + explode as cpp_explode ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 25f57748765..0a1a8d7ba0c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7709,6 +7709,56 @@ def equals(self, other): return False return super().equals(other) + def explode(self, column, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index + values. + + Parameters + ---------- + column : str or tuple + Column to explode. Now only supports one column + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + + Notes + ------- + In cudf, empty lists `[]` are mapped to nulls, as opposed to `nan` in + Pandas. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame( + ... {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) + >>> df + a b + 0 [1, 2, 3] 11 + 1 [] 22 + 2 None 33 + 3 [4, 5] 44 + >>> df.explode('a') + a b + 0 1 11 + 0 2 11 + 0 3 11 + 1 <NA> 22 + 2 <NA> 33 + 3 4 44 + 3 5 44 + """ + if column not in self._column_names: + raise KeyError(column) + + explode_num = self._column_names.index(column) + return super()._explode( + explode_num, None if ignore_index else self.index + ) + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fab5936f94d..22c47213138 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,16 @@ import functools import warnings from collections import OrderedDict, abc as abc -from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, + Tuple, + TypeVar, + Union, + overload, +) import cupy import numpy as np @@ -573,6 +582,18 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + def _explode(self, explode_column_num: int, index: Optional[cudf.Index]): + if index is not None: + explode_column_num += index.nlevels + res_tbl = libcudf.lists.explode_outer( + cudf._lib.table.Table(self._data, index=index), explode_column_num + ) + + res = self.__class__._from_table(res_tbl) + if index is not None: + res.index.names = index.names + return res + def _get_columns_by_label(self, labels, downcast): """ Returns columns of the Frame specified by `labels` diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5e7121c0488..4aaf2c0f94d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6364,6 +6364,48 @@ def keys(self): """ return self.index + def explode(self, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index + values.
+ + Parameters + ---------- + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series + + Notes + ------- + In cudf, empty lists `[]` are mapped to nulls, as opposed to `nan` in + Pandas. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]]) + >>> s + 0 [1, 2, 3] + 1 [] + 2 None + 3 [4, 5] + dtype: list + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 <NA> + 2 <NA> + 3 4 + 3 5 + dtype: int64 + """ + + return super()._explode(0, None if ignore_index else self.index) + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 77548b95277..e2e6c469949 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8442,3 +8442,31 @@ def test_rename_for_level_is_None_MC(): got = gdf.rename(columns={"a": "f"}, level=None) assert_eq(expect, got) + + +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(ignore_index, p_index): + gdf = cudf.DataFrame( + { + "a": [[1, 2, 3], None, [4], [], [5, 6]], + "b": [11, 22, 33, 44, 55], + "c": ["a", "e", "i", "o", "u"], + }, + index=p_index, + ) + pdf = gdf.to_pandas(nullable=True) + + expect = pdf.explode("a", ignore_index).fillna(pd.NA) + got = gdf.explode("a", ignore_index) + + assert_eq(got, expect, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a1b4236719d..d8531657177 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1118,3 +1118,24 @@ def test_series_drop_raises(): actual = gs.drop("p", errors="ignore") assert_eq(actual, expect) + + +@pytest.mark.parametrize("ignore_index", [True, False])
+@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(ignore_index, p_index): + gdf = cudf.Series([[1, 2, 3], None, [4], [], [5, 6]], index=p_index) + pdf = gdf.to_pandas(nullable=True) + + expect = pdf.explode(ignore_index) + got = gdf.explode(ignore_index) + + assert_eq(expect, got, check_dtype=False)