diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0806bb964cf..68008e13897 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -79,6 +79,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp + - test -f $PREFIX/include/cudf/detail/copy.cuh - test -f $PREFIX/include/cudf/detail/datetime.hpp - test -f $PREFIX/include/cudf/detail/fill.hpp - test -f $PREFIX/include/cudf/detail/gather.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 15caaec9bec..cbe2811afe4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -238,6 +238,7 @@ add_library( src/copying/gather.cu src/copying/get_element.cu src/copying/pack.cpp + src/copying/purge_nonempty_nulls.cu src/copying/reverse.cu src/copying/sample.cu src/copying/scatter.cu diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 2e559afef4f..8f1ad7da9b6 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -17,7 +17,10 @@ #pragma once #include +#include #include +#include +#include #include #include @@ -939,5 +942,155 @@ std::unique_ptr sample( int64_t const seed = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Checks if a column or its descendants have non-empty null rows + * + * @note This function is exact. If it returns `true`, there exists one or more + * non-empty null elements. + * + * A LIST or STRING column might have non-empty rows that are marked as null. + * A STRUCT or LIST column might have child columns that have non-empty null rows. + * Other types of columns are deemed incapable of having non-empty null rows. + * E.g. Fixed width columns have no concept of an "empty" row. + * + * @param input The column which is (and whose descendants are) to be checked for + * non-empty null rows. 
+ * @return true If either the column or its descendants have non-empty null rows. + * @return false If neither the column nor its descendants have non-empty null rows. + */ +bool has_nonempty_nulls(column_view const& input); + +/** + * @brief Approximates if a column or its descendants *may* have non-empty null elements + * + * @note This function is approximate. + * - `true`: Non-empty null elements could exist + * - `false`: Non-empty null elements definitely do not exist + * + * False positives are possible, but false negatives are not. + * + * Compared to the exact `has_nonempty_nulls()` function, this function is typically + * more efficient. + * + * Complexity: + * - Best case: `O(count_descendants(input))` + * - Worst case: `O(count_descendants(input)) * m`, where `m` is the number of rows in the largest + * descendant + * + * @param input The column which is (and whose descendants are) to be checked for + * non-empty null rows + * @return true If either the column or its descendants have null rows + * @return false If neither the column nor its descendants have null rows + */ +bool may_have_nonempty_nulls(column_view const& input); + +/** + * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * + * LIST columns may have non-empty null rows. + * For example: + * @code{.pseudo} + * + * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release(); + * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false); + * + * lists[1] is now null, but the lists child column still stores `{2,3}`. 
+ * The lists column contents will be: + * Validity: 101 + * Offsets: [0, 2, 4, 6] + * Child: [0, 1, 2, 3, 4, 5] + * + * After purging the contents of the list's null rows, the column's contents + * will be: + * Validity: 101 + * Offsets: [0, 2, 2, 4] + * Child: [0, 1, 4, 5] + * @endcode + * + * The purge operation only applies directly to LIST and STRING columns, but it + * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns + * may have child/descendant columns that are LIST or STRING. + * + * @param input The column whose null rows are to be checked and purged + * @param mr Device memory resource used to allocate the returned column's device memory + * @return std::unique_ptr Column with equivalent contents to `input`, but with + * the contents of null rows purged + */ +std::unique_ptr purge_nonempty_nulls( + lists_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * + * STRING columns may have non-empty null rows. + * For example: + * @code{.pseudo} + * + * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release(); + * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false); + * + * strings[1] is now null, but the strings column still stores `"CD"`. + * The strings column contents will be: + * Validity: 101 + * Offsets: [0, 2, 4, 6] + * Child: [A, B, C, D, E, F] + * + * After purging the contents of the strings column's null rows, the column's contents + * will be: + * Validity: 101 + * Offsets: [0, 2, 2, 4] + * Child: [A, B, E, F] + * @endcode + * + * The purge operation only applies directly to LIST and STRING columns, but it + * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns + * may have child/descendant columns that are LIST or STRING. 
+ * + * @param input The column whose null rows are to be checked and purged + * @param mr Device memory resource used to allocate the returned column's device memory + * @return std::unique_ptr Column with equivalent contents to `input`, but with + * the contents of null rows purged + */ +std::unique_ptr purge_nonempty_nulls( + strings_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * + * STRUCT columns may have null rows, with non-empty child rows. + * For example: + * @code{.pseudo} + * + * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }; + * auto const structs = structs_column_wrapper{ {lists}, null_at(1) }; + * + * structs[1].child is now null, but the lists column still stores `{2,3}`. + * The lists column contents will be: + * Validity: 101 + * Offsets: [0, 2, 4, 6] + * Child: [0, 1, 2, 3, 4, 5] + * + * After purging the contents of the list's null rows, the column's contents + * will be: + * Validity: 101 + * Offsets: [0, 2, 2, 4] + * Child: [0, 1, 4, 5] + * @endcode + * + * The purge operation only applies directly to LIST and STRING columns, but it + * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns + * may have child/descendant columns that are LIST or STRING. 
+ * + * @param input The column whose null rows are to be checked and purged + * @param mr Device memory resource used to allocate the returned column's device memory + * @return std::unique_ptr Column with equivalent contents to `input`, but with + * the contents of null rows purged + */ +std::unique_ptr purge_nonempty_nulls( + structs_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ } // namespace cudf diff --git a/cpp/include/cudf/detail/copy.cuh b/cpp/include/cudf/detail/copy.cuh new file mode 100644 index 00000000000..773bce7131f --- /dev/null +++ b/cpp/include/cudf/detail/copy.cuh @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf::detail { + +/** + * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) + * + * @tparam ColumnViewT View type (lists_column_view, strings_column_view, or structs_column_view) + * @param stream CUDA stream used for device memory operations and kernel launches + */ +template +std::unique_ptr purge_nonempty_nulls(ColumnViewT const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Implement via identity gather. 
+ auto const input_column = input.parent(); + auto const gather_begin = thrust::counting_iterator(0); + auto const gather_end = gather_begin + input_column.size(); + + auto gathered_table = cudf::detail::gather(table_view{{input_column}}, + gather_begin, + gather_end, + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(gathered_table->release()[0]); +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 50157d16876..abd14fbda89 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -299,5 +299,22 @@ std::unique_ptr get_element( size_type index, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::has_nonempty_nulls + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +bool has_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::may_have_nonempty_nulls + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +bool may_have_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index c637ad041ba..7df36be2385 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, auto dst_offsets_c = cudf::make_fixed_width_column( data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view(); + auto const source_column_nullmask = source_column.null_mask(); // generate the compacted outgoing offsets. auto count_iter = thrust::make_counting_iterator(0); @@ -90,12 +92,23 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, count_iter, count_iter + offset_count, dst_offsets_v.begin(), - [gather_map, output_count, src_offsets, src_size] __device__(int32_t index) -> int32_t { + [source_column_nullmask, + source_column_offset = source_column.offset(), + gather_map, + output_count, + src_offsets, + src_size] __device__(int32_t index) -> int32_t { int32_t offset_index = index < output_count ? gather_map[index] : 0; // if this is an invalid index, this will be a NULL list if (NullifyOutOfBounds && ((offset_index < 0) || (offset_index >= src_size))) { return 0; } + // If the source row is null, the output row size must be 0. 
+ if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + offset_index)) { + return 0; + } + // the length of this list return src_offsets[offset_index + 1] - src_offsets[offset_index]; }, @@ -110,15 +123,27 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); - thrust::transform(rmm::exec_policy(stream), - gather_map, - gather_map + output_count, - base_offsets.data(), - [src_offsets, src_size, shift] __device__(int32_t index) { - // if this is an invalid index, this will be a NULL list - if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } - return src_offsets[index] - shift; - }); + thrust::transform( + rmm::exec_policy(stream), + gather_map, + gather_map + output_count, + base_offsets.data(), + [source_column_nullmask, + source_column_offset = source_column.offset(), + src_offsets, + src_size, + shift] __device__(int32_t index) { + // if this is an invalid index, this will be a NULL list + if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } + + // If the source row is null, the output row size must be 0. 
+ if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + index)) { + return 0; + } + + return src_offsets[index] - shift; + }); // Retrieve size of the resulting gather map for level N+1 (the last offset) size_type child_gather_map_size = diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 1b10c70d6d6..d46ab3a91a1 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -303,14 +303,17 @@ std::unique_ptr gather( data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); auto const d_out_offsets = out_offsets_column->mutable_view().template data(); auto const d_in_offsets = (strings_count > 0) ? strings.offsets_begin() : nullptr; - thrust::transform(rmm::exec_policy(stream), - begin, - end, - d_out_offsets, - [d_in_offsets, strings_count] __device__(size_type in_idx) { - if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; - return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; - }); + auto const d_strings = column_device_view::create(strings.parent(), stream); + thrust::transform( + rmm::exec_policy(stream), + begin, + end, + d_out_offsets, + [d_strings = *d_strings, d_in_offsets, strings_count] __device__(size_type in_idx) { + if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; + if (not d_strings.is_valid(in_idx)) return 0; + return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; + }); // check total size is not too large size_t const total_bytes = thrust::transform_reduce( @@ -329,7 +332,6 @@ std::unique_ptr gather( // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_strings = column_device_view::create(strings.parent(), stream); auto out_chars_column = gather_chars(d_strings->begin(), begin, end, diff --git a/cpp/include/cudf/structs/structs_column_view.hpp 
b/cpp/include/cudf/structs/structs_column_view.hpp index 329c24cfe0a..ca866d8555e 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ class structs_column_view : public column_view { explicit structs_column_view(column_view const& rhs); + /** + * @brief Returns the parent column. + */ + [[nodiscard]] column_view parent() const; + using column_view::child_begin; using column_view::child_end; using column_view::has_nulls; diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu new file mode 100644 index 00000000000..778d6c4df55 --- /dev/null +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +namespace cudf { +namespace detail { + +using cudf::type_id; + +namespace { + +/// Check if a column of the given type (or its descendants) can have non-empty null rows. 
+bool type_may_have_nonempty_nulls(cudf::type_id const& type) +{ + return type == type_id::STRING || type == type_id::LIST || type == type_id::STRUCT; +} + +/// Check if the (STRING/LIST) column has any null rows with non-zero length. +bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + if (not input.has_nulls()) { return false; } // No nulls => no dirty rows. + + // Cross-reference nullmask and offsets; the offsets child is unsliced, so shift by input.offset(). + auto const type = input.type().id(); + auto const offsets = (type == type_id::STRING) ? (strings_column_view{input}).offsets() + : (lists_column_view{input}).offsets(); + auto const d_input = cudf::column_device_view::create(input, stream); + auto const is_dirty_row = [d_input = *d_input, offsets = offsets.begin() + input.offset()] __device__( + size_type const& row_idx) { + return d_input.is_null_nocheck(row_idx) && (offsets[row_idx] != offsets[row_idx + 1]); + }; + + auto const row_begin = thrust::counting_iterator(0); + auto const row_end = row_begin + input.size(); + return thrust::count_if(rmm::exec_policy(stream), row_begin, row_end, is_dirty_row) > 0; +} + +} // namespace + +/** + * @copydoc cudf::detail::has_nonempty_nulls + */ +bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + auto const type = input.type().id(); + + if (not type_may_have_nonempty_nulls(type)) { return false; } + + // For types with variable-length rows, check if any rows are "dirty". + // A dirty row is a null row with non-zero length. + if ((type == type_id::STRING || type == type_id::LIST) && has_nonempty_null_rows(input, stream)) { + return true; + } + + // For complex types, check if child columns need purging. 
+ if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), [stream](auto const& child) { + return cudf::detail::has_nonempty_nulls(child, stream); + })) { + return true; + } + + return false; +} +} // namespace detail + +/** + * @copydoc cudf::may_have_nonempty_nulls + */ +bool may_have_nonempty_nulls(column_view const& input) +{ + auto const type = input.type().id(); + + if (not detail::type_may_have_nonempty_nulls(type)) { return false; } + + if ((type == type_id::STRING || type == type_id::LIST) && input.has_nulls()) { return true; } + + if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), may_have_nonempty_nulls)) { + return true; + } + + return false; +} + +/** + * @copydoc cudf::has_nonempty_nulls + */ +bool has_nonempty_nulls(column_view const& input) { return detail::has_nonempty_nulls(input); } + +/** + * @copydoc cudf::purge_nonempty_nulls(lists_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(lists_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(structs_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(strings_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(strings_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index 681f13386ff..7d8c8837d2d 
100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -25,6 +25,8 @@ structs_column_view::structs_column_view(column_view const& rhs) : column_view{r CUDF_EXPECTS(type().id() == type_id::STRUCT, "structs_column_view only supports struct columns"); } +column_view structs_column_view::parent() const { return *this; } + column_view structs_column_view::get_sliced_child(int index) const { std::vector children; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e016f47616b..95c54d7596e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -250,6 +250,7 @@ ConfigureTest( copying/gather_tests.cpp copying/get_value_tests.cpp copying/pack_tests.cpp + copying/purge_nonempty_nulls_tests.cpp copying/sample_tests.cpp copying/scatter_tests.cpp copying/scatter_list_tests.cpp diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 4e0e70bf15c..44a79e63cd8 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -645,7 +645,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNonNested) 0, cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *expected); } TYPED_TEST(ListsStructsLeafTest, FromNested) diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp new file mode 100644 index 00000000000..77fd3f66ee5 --- /dev/null +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::test { + +using iterators::no_nulls; +using iterators::null_at; +using iterators::nulls_at; +using T = int32_t; // The actual type of the leaf node isn't really important. +using values_col_t = fixed_width_column_wrapper; +using offsets_col_t = fixed_width_column_wrapper; +using gather_map_t = fixed_width_column_wrapper; + +template +using LCW = cudf::test::lists_column_wrapper; + +struct PurgeNonEmptyNullsTest : public cudf::test::BaseFixture { + /// Helper to run gather() on a single column, and extract the single column from the result. + std::unique_ptr gather(column_view const& input, gather_map_t const& gather_map) + { + auto gathered = + cudf::gather(cudf::table_view{{input}}, gather_map, out_of_bounds_policy::NULLIFY); + return std::move(gathered->release()[0]); + } + + /// Verify that the result of `sanitize()` is equivalent to the unsanitized input, + /// except that the null rows are also empty. + template + void test_purge(ColumnViewT const& unpurged) + { + auto const purged = cudf::purge_nonempty_nulls(unpurged); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(unpurged.parent(), *purged); + EXPECT_FALSE(cudf::has_nonempty_nulls(*purged)); + } +}; + +// List. +TEST_F(PurgeNonEmptyNullsTest, SingleLevelList) +{ + auto const input = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, // <--- Will be set to NULL. Unsanitized row. 
+ {8, 9, 10}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows preceded by unsanitized rows. + auto const results = gather(input->view(), {3, 100, 0}); + auto const expected = LCW{{ + {8, 9, 10}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows followed by unsanitized rows. + auto const results = gather(input->view(), {1, 100, 0}); + auto const expected = LCW{{ + {5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects unsanitized row specifically. 
+ auto const results = gather(input->view(), {2}); + auto const results_lists_view = lists_column_view(*results); + auto const expected = LCW{{ + LCW{} // NULL. + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), values_col_t{}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>. +TEST_F(PurgeNonEmptyNullsTest, TwoLevelList) +{ + auto const input = + LCW{ + {{{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}, + {{21}, {22, 23}, {24, 25, 26}}, + {{31, 32}, {33, 34, 35, 36}, {}, {37, 38}}, //<--- Will be set to NULL. Unsanitized row. + {{41}, {42, 43}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Verify that gather() output is sanitized. + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{{ + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, // i.e. input[0] + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}} // i.e. 
input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_lists_view.child(), + LCW{ + {1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}, {11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}); + + auto const child_lists_view = lists_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_lists_view.offsets(), + offsets_col_t{0, 3, 7, 8, 10, 11, 13, 16, 19, 20}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + child_lists_view.child(), + values_col_t{1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 11, 12, 13, 14, 15, 16, 17, 18, 19}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>>. +TEST_F(PurgeNonEmptyNullsTest, ThreeLevelList) +{ + auto const input = LCW{{{{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}}, + {{{21, 21}}, {{22, 23}, {}}, {{24, 25}, {26}}}, + {{{31, 32}, {}}, + {{33, 34, 35}, {36}}, + {}, + {{37, 38}}}, //<--- Will be set to NULL. Unsanitized row. + {{{41, 41, 41}}, {{42, 43}}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{ + { + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, // i.e. 
input[0] + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}} // i.e. input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), + LCW{{{1, 2}, {3}}, + {{4, 5}, {6, 7}}, + {{8, 8}, {}}, + {{9, 1}}, + {{2, 3}}, + {{11, 12}}, + {{13}, {14, 15}}, + {{16, 17, 18}}, + {{19, 19}, {}}}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, ListOfStrings) +{ + using T = string_view; + + auto const input = LCW{{{{"1", "22", "", "4444"}, null_at(2)}, + {"55555"}, + {"666666", "7777777"}, // <--- Will be set to NULL. Unsanitized row. + {"88888888", "999999999", "1010101010"}, + {"11", "22", "33", "44"}, + {"55", "66", "77", "88"}}, + no_nulls()} + .release(); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{"55555"}, + {}, // NULL. 
+ {{"1", "22", "", "4444"}, null_at(2)}, + {"88888888", "999999999", "1010101010"}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + {"55555", "1", "22", "", "4444", "88888888", "999999999", "1010101010"}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Gathering from a sliced column. + auto const sliced = cudf::slice({input->view()}, {1, 5})[0]; // Lop off 1 row at each end. + EXPECT_TRUE(cudf::may_have_nonempty_nulls(sliced)); + EXPECT_TRUE(cudf::has_nonempty_nulls(sliced)); + + auto const results = gather(sliced, {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + auto const expected = LCW{{ + {}, + {"88888888", "999999999", "1010101010"}, + {"55555"}, + {"11", "22", "33", "44"}, + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 0, 3, 4, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + "88888888", "999999999", "1010101010", "55555", "11", "22", "33", "44"}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings) +{ + auto strings = + strings_column_wrapper{ + {"1", "22", "3", "44", "5", "66", "7", "8888", "9", "1010"}, //<--- "8888" will be + // unsanitized. + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*strings)); + + // Set strings nullmask, post construction. 
+ set_null_mask(strings->mutable_view().null_mask(), 7, 8, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*strings)); + + test_purge(strings_column_view{*strings}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + strings_column_view(*strings).offsets(), offsets_col_t{0, 1, 3, 4, 6, 7, 9, 10, 14, 15, 19} + // 10-14 indicates that "8888" is unsanitized. + ); + + // Construct a list column from the strings column. + auto const lists = make_lists_column(4, + offsets_col_t{0, 4, 5, 7, 10}.release(), + std::move(strings), + 0, + detail::make_null_mask(no_nulls(), no_nulls() + 4)); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + // Set lists nullmask, post construction. + cudf::detail::set_null_mask(lists->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + test_purge(lists_column_view{*lists}); + + // At this point, + // 1. {"66", "7"} will be unsanitized. + // 2. {"8888", "9", "1010"} will actually be {NULL, "9", "1010"}. + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + lists_column_view(*lists).offsets(), + offsets_col_t{0, 4, 5, 7, 10}); // 5-7 indicates that list row#2 is unsanitized. + + auto const result = gather(lists->view(), {1, 2, 0, 3}); + auto const expected = LCW{{{"5"}, + {}, // NULL. + {"1", "22", "3", "44"}, + {{"", "9", "1010"}, null_at(0)}}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + + // Ensure input row#2 (now at result index 1) has been sanitized. + auto const results_lists_view = lists_column_view(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 1, 1, 5, 8} + // 1-1 indicates that input row#2 is sanitized. + ); + + // Ensure that "8888" has been sanitized, and stored as "".
+ auto const child_strings_view = strings_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_strings_view.offsets(), + offsets_col_t{0, 1, 2, 4, 5, 7, 7, 8, 12}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); +} + +// Struct<List<int>>: a null struct row must not leave a non-empty list row in its child after gather(). +TEST_F(PurgeNonEmptyNullsTest, StructOfList) +{ + auto const structs_input = + [] { + auto child = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, //<--- Unsanitized row. + {8, 9, 10}}, + no_nulls()}; + EXPECT_FALSE(cudf::has_nonempty_nulls(child)); + return structs_column_wrapper{{child}, null_at(2)}; + }() + .release(); + + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*structs_input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*structs_input)); + + test_purge(structs_column_view{*structs_input}); + + // At this point, even though the structs column has a null at index 2, + // the child column has a non-empty list row at index 2: {6, 7}. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lists_column_view(structs_input->child(0)).child(), + values_col_t{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, null_at(2)}); + + { + // Test rearrange. + auto const gather_map = gather_map_t{1, 2, 0, 3}; + auto const result = gather(structs_input->view(), gather_map); + auto const expected_result = [] { + auto child = LCW{{{5}, + LCW{}, //<--- Now, sanitized. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + return structs_column_wrapper{{child}, null_at(1)}; + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_result); + auto const results_child = lists_column_view(result->child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); + } +} + +} // namespace cudf::test