From 84f88ceb18225850835a9912a18e4c82245d5620 Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 28 Apr 2022 23:45:40 -0700 Subject: [PATCH 01/23] Support purging non-empty null elements from LIST/STRING columns (#10701) Fixes #10291. With certain operations in `libcudf`, it is possible to produce `LIST` columns with `NULL` rows that are not also empty. For instance, consider a `STRUCT` column that is constructed with an explicit validity buffer and a `LIST` child column: ```c++ auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }; auto const structs = structs_column_wrapper{ {lists}, null_at(1) }; ``` Since `structs[1] == NULL`, its `LIST` member is also deemed null. However, for efficiency, the null-ness is recorded in the `LIST`'s validity buffer, without purging the unnecessary values from its child. The `LIST` column appears as follows: ``` Validity: 101 Offsets: [0, 2, 4, 6] Child: [0, 1, 2, 3, 4, 5] ``` Even though Row#1 is null, its size is `4-2 = 2`, and not `0`. (Row#1 is thus a non-empty null row.) This commit adds a `cudf::purge_nonempty_nulls()` function that purges such rows, and reduces such columns to a more space-efficient representation, i.e.: ``` Validity: 101 Offsets: [0, 2, 2, 4] Child: [0, 1, 4, 5] ``` This commit also modifies `cudf::gather()` not to produce `STRING`/`LIST` columns with "dirty" rows. Further, it adds two new functions to determine if a specified column needs such purging: 1. `cudf::may_have_nonempty_nulls()`: A fast check of a column for the *possibility* of having non-empty nulls. This only checks whether the column or its descendants have null rows at all. If there are no nulls anywhere in the hierarchy, it does not need purging. 2. `cudf::has_nonempty_nulls()`: A deeper, more expensive check that categorically confirms whether non-empty null rows exist in any column in the hierarchy. 
Authors: - MithunR (https://github.com/mythrocks) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - https://github.com/nvdbaranec - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10701 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/copying.hpp | 153 ++++++ cpp/include/cudf/detail/copy.cuh | 47 ++ cpp/include/cudf/detail/copy.hpp | 19 +- cpp/include/cudf/lists/detail/gather.cuh | 45 +- cpp/include/cudf/strings/detail/gather.cuh | 20 +- .../cudf/structs/structs_column_view.hpp | 7 +- cpp/src/copying/purge_nonempty_nulls.cu | 134 ++++++ cpp/src/structs/structs_column_view.cpp | 2 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/column/factories_test.cpp | 2 +- .../copying/purge_nonempty_nulls_tests.cpp | 437 ++++++++++++++++++ 13 files changed, 847 insertions(+), 22 deletions(-) create mode 100644 cpp/include/cudf/detail/copy.cuh create mode 100644 cpp/src/copying/purge_nonempty_nulls.cu create mode 100644 cpp/tests/copying/purge_nonempty_nulls_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0806bb964cf..68008e13897 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -79,6 +79,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp + - test -f $PREFIX/include/cudf/detail/copy.cuh - test -f $PREFIX/include/cudf/detail/datetime.hpp - test -f $PREFIX/include/cudf/detail/fill.hpp - test -f $PREFIX/include/cudf/detail/gather.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 15caaec9bec..cbe2811afe4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -238,6 +238,7 @@ add_library( src/copying/gather.cu src/copying/get_element.cu src/copying/pack.cpp + src/copying/purge_nonempty_nulls.cu src/copying/reverse.cu src/copying/sample.cu src/copying/scatter.cu diff --git 
a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 2e559afef4f..8f1ad7da9b6 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -17,7 +17,10 @@ #pragma once #include +#include #include +#include +#include #include #include @@ -939,5 +942,155 @@ std::unique_ptr sample( int64_t const seed = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Checks if a column or its descendants have non-empty null rows + * + * @note This function is exact. If it returns `true`, there exist one or more + * non-empty null elements. + * + * A LIST or STRING column might have non-empty rows that are marked as null. + * A STRUCT or LIST column might have child columns that have non-empty null rows. + * Other types of columns are deemed incapable of having non-empty null rows. + * E.g. Fixed width columns have no concept of an "empty" row. + * + * @param input The column which is (and whose descendants are) to be checked for + * non-empty null rows. + * @return true If either the column or its descendants have non-empty null rows. + * @return false If neither the column nor its descendants have non-empty null rows. + */ +bool has_nonempty_nulls(column_view const& input); + +/** + * @brief Approximates if a column or its descendants *may* have non-empty null elements + * + * @note This function is approximate. + * - `true`: Non-empty null elements could exist + * - `false`: Non-empty null elements definitely do not exist + * + * False positives are possible, but false negatives are not. + * + * Compared to the exact `has_nonempty_nulls()` function, this function is typically + * more efficient. 
+ * + * Complexity: + * - Best case: `O(count_descendants(input))` + * - Worst case: `O(count_descendants(input)) * m`, where `m` is the number of rows in the largest + * descendant + * + * @param input The column which is (and whose descendants are) to be checked for + * non-empty null rows + * @return true If either the column or its descendants have null rows + * @return false If neither the column nor its descendants have null rows + */ +bool may_have_nonempty_nulls(column_view const& input); + +/** + * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * + * LIST columns may have non-empty null rows. + * For example: + * @code{.pseudo} + * + * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release(); + * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false); + * + * lists[1] is now null, but the lists child column still stores `{2,3}`. + * The lists column contents will be: + * Validity: 101 + * Offsets: [0, 2, 4, 6] + * Child: [0, 1, 2, 3, 4, 5] + * + * After purging the contents of the list's null rows, the column's contents + * will be: + * Validity: 101 + * Offsets: [0, 2, 2, 4] + * Child: [0, 1, 4, 5] + * @endcode + * + * The purge operation only applies directly to LIST and STRING columns, but it + * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns + * may have child/descendant columns that are LIST or STRING. 
+ * + * @param input The column whose null rows are to be checked and purged + * @param mr Device memory resource used to allocate the returned column's device memory + * @return std::unique_ptr Column with equivalent contents to `input`, but with + * the contents of null rows purged + */ +std::unique_ptr purge_nonempty_nulls( + lists_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * + * STRING columns may have non-empty null rows. + * For example: + * @code{.pseudo} + * + * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release(); + * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false); + * + * strings[1] is now null, but the strings column still stores `"CD"`. + * The strings column contents will be: + * Validity: 101 + * Offsets: [0, 2, 4, 6] + * Child: [A, B, C, D, E, F] + * + * After purging the contents of the strings column's null rows, the column's contents + * will be: + * Validity: 101 + * Offsets: [0, 2, 2, 4] + * Child: [A, B, E, F] + * @endcode + * + * The purge operation only applies directly to LIST and STRING columns, but it + * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns + * may have child/descendant columns that are LIST or STRING. + * + * @param input The column whose null rows are to be checked and purged + * @param mr Device memory resource used to allocate the returned column's device memory + * @return std::unique_ptr Column with equivalent contents to `input`, but with + * the contents of null rows purged + */ +std::unique_ptr purge_nonempty_nulls( + strings_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * + * STRUCT columns may have null rows, with non-empty child rows. 
+ * For example: + * @code{.pseudo} + * + * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }; + * auto const structs = structs_column_wrapper{ {lists}, null_at(1) }; + * + * structs[1].child is now null, but the lists column still stores `{2,3}`. + * The lists column contents will be: + * Validity: 101 + * Offsets: [0, 2, 4, 6] + * Child: [0, 1, 2, 3, 4, 5] + * + * After purging the contents of the list's null rows, the column's contents + * will be: + * Validity: 101 + * Offsets: [0, 2, 2, 4] + * Child: [0, 1, 4, 5] + * @endcode + * + * The purge operation only applies directly to LIST and STRING columns, but it + * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns + * may have child/descendant columns that are LIST or STRING. + * + * @param input The column whose null rows are to be checked and purged + * @param mr Device memory resource used to allocate the returned column's device memory + * @return std::unique_ptr Column with equivalent contents to `input`, but with + * the contents of null rows purged + */ +std::unique_ptr purge_nonempty_nulls( + structs_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ } // namespace cudf diff --git a/cpp/include/cudf/detail/copy.cuh b/cpp/include/cudf/detail/copy.cuh new file mode 100644 index 00000000000..773bce7131f --- /dev/null +++ b/cpp/include/cudf/detail/copy.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace cudf::detail { + +/** + * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) + * + * @tparam ColumnViewT View type (lists_column_view, strings_column_view, or structs_column_view) + * @param stream CUDA stream used for device memory operations and kernel launches + */ +template +std::unique_ptr purge_nonempty_nulls(ColumnViewT const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Implement via identity gather: gather() emits null LIST/STRING rows as empty. + auto const input_column = input.parent(); + auto const gather_begin = thrust::counting_iterator(0); + auto const gather_end = gather_begin + input_column.size(); + + auto gathered_table = cudf::detail::gather(table_view{{input_column}}, + gather_begin, + gather_end, + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(gathered_table->release()[0]); +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 50157d16876..abd14fbda89 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -299,5 +299,22 @@ std::unique_ptr get_element( size_type index, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::has_nonempty_nulls + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +bool has_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::may_have_nonempty_nulls + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +bool may_have_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index c637ad041ba..7df36be2385 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, auto dst_offsets_c = cudf::make_fixed_width_column( data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view(); + auto const source_column_nullmask = source_column.null_mask(); // generate the compacted outgoing offsets. auto count_iter = thrust::make_counting_iterator(0); @@ -90,12 +92,23 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, count_iter, count_iter + offset_count, dst_offsets_v.begin(), - [gather_map, output_count, src_offsets, src_size] __device__(int32_t index) -> int32_t { + [source_column_nullmask, + source_column_offset = source_column.offset(), + gather_map, + output_count, + src_offsets, + src_size] __device__(int32_t index) -> int32_t { int32_t offset_index = index < output_count ? gather_map[index] : 0; // if this is an invalid index, this will be a NULL list if (NullifyOutOfBounds && ((offset_index < 0) || (offset_index >= src_size))) { return 0; } + // If the source row is null, the output row size must be 0. 
+ if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + offset_index)) { + return 0; + } + // the length of this list return src_offsets[offset_index + 1] - src_offsets[offset_index]; }, @@ -110,15 +123,27 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); - thrust::transform(rmm::exec_policy(stream), - gather_map, - gather_map + output_count, - base_offsets.data(), - [src_offsets, src_size, shift] __device__(int32_t index) { - // if this is an invalid index, this will be a NULL list - if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } - return src_offsets[index] - shift; - }); + thrust::transform( + rmm::exec_policy(stream), + gather_map, + gather_map + output_count, + base_offsets.data(), + [source_column_nullmask, + source_column_offset = source_column.offset(), + src_offsets, + src_size, + shift] __device__(int32_t index) { + // if this is an invalid index, this will be a NULL list + if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } + + // If the source row is null, the output row size must be 0. 
+ if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + index)) { + return 0; + } + + return src_offsets[index] - shift; + }); // Retrieve size of the resulting gather map for level N+1 (the last offset) size_type child_gather_map_size = diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 1b10c70d6d6..d46ab3a91a1 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -303,14 +303,17 @@ std::unique_ptr gather( data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); auto const d_out_offsets = out_offsets_column->mutable_view().template data(); auto const d_in_offsets = (strings_count > 0) ? strings.offsets_begin() : nullptr; - thrust::transform(rmm::exec_policy(stream), - begin, - end, - d_out_offsets, - [d_in_offsets, strings_count] __device__(size_type in_idx) { - if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; - return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; - }); + auto const d_strings = column_device_view::create(strings.parent(), stream); + thrust::transform( + rmm::exec_policy(stream), + begin, + end, + d_out_offsets, + [d_strings = *d_strings, d_in_offsets, strings_count] __device__(size_type in_idx) { + if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; + if (not d_strings.is_valid(in_idx)) return 0; + return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; + }); // check total size is not too large size_t const total_bytes = thrust::transform_reduce( @@ -329,7 +332,6 @@ std::unique_ptr gather( // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_strings = column_device_view::create(strings.parent(), stream); auto out_chars_column = gather_chars(d_strings->begin(), begin, end, diff --git a/cpp/include/cudf/structs/structs_column_view.hpp 
b/cpp/include/cudf/structs/structs_column_view.hpp index 329c24cfe0a..ca866d8555e 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ class structs_column_view : public column_view { explicit structs_column_view(column_view const& rhs); + /** + * @brief Returns the parent column. + */ + [[nodiscard]] column_view parent() const; + using column_view::child_begin; using column_view::child_end; using column_view::has_nulls; diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu new file mode 100644 index 00000000000..778d6c4df55 --- /dev/null +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +namespace cudf { +namespace detail { + +using cudf::type_id; + +namespace { + +/// Check if nonempty-null checks can be skipped for a given type. 
+bool type_may_have_nonempty_nulls(cudf::type_id const& type) +{ + return type == type_id::STRING || type == type_id::LIST || type == type_id::STRUCT; +} + +/// Check if the (STRING/LIST) column has any null rows with non-zero length. +bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + if (not input.has_nulls()) { return false; } // No nulls => no dirty rows. + + // Cross-reference nullmask and offsets. offsets_begin() honors the offset of a sliced input. + auto const type = input.type().id(); + auto const offsets = (type == type_id::STRING) ? (strings_column_view{input}).offsets_begin() + : (lists_column_view{input}).offsets_begin(); + auto const d_input = cudf::column_device_view::create(input, stream); + auto const is_dirty_row = [d_input = *d_input, offsets] __device__( + size_type const& row_idx) { + return d_input.is_null_nocheck(row_idx) && (offsets[row_idx] != offsets[row_idx + 1]); + }; + + auto const row_begin = thrust::counting_iterator(0); + auto const row_end = row_begin + input.size(); + return thrust::count_if(rmm::exec_policy(stream), row_begin, row_end, is_dirty_row) > 0; +} + +} // namespace + +/** + * @copydoc cudf::detail::has_nonempty_nulls + */ +bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + auto const type = input.type().id(); + + if (not type_may_have_nonempty_nulls(type)) { return false; } + + // For types with variable-length rows, check if any rows are "dirty". + // A dirty row is a null row with non-zero length. + if ((type == type_id::STRING || type == type_id::LIST) && has_nonempty_null_rows(input, stream)) { + return true; + } + + // For complex types, check if child columns need purging. 
+ if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), [stream](auto const& child) { + return cudf::detail::has_nonempty_nulls(child, stream); + })) { + return true; + } + + return false; +} +} // namespace detail + +/** + * @copydoc cudf::may_have_nonempty_nulls + */ +bool may_have_nonempty_nulls(column_view const& input) +{ + auto const type = input.type().id(); + + if (not detail::type_may_have_nonempty_nulls(type)) { return false; } + + if ((type == type_id::STRING || type == type_id::LIST) && input.has_nulls()) { return true; } + + if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), may_have_nonempty_nulls)) { + return true; + } + + return false; +} + +/** + * @copydoc cudf::has_nonempty_nulls + */ +bool has_nonempty_nulls(column_view const& input) { return detail::has_nonempty_nulls(input); } + +/** + * @copydoc cudf::purge_nonempty_nulls(lists_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(lists_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(structs_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(strings_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(strings_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index 681f13386ff..7d8c8837d2d 
100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -25,6 +25,8 @@ structs_column_view::structs_column_view(column_view const& rhs) : column_view{r CUDF_EXPECTS(type().id() == type_id::STRUCT, "structs_column_view only supports struct columns"); } +column_view structs_column_view::parent() const { return *this; } + column_view structs_column_view::get_sliced_child(int index) const { std::vector children; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e016f47616b..95c54d7596e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -250,6 +250,7 @@ ConfigureTest( copying/gather_tests.cpp copying/get_value_tests.cpp copying/pack_tests.cpp + copying/purge_nonempty_nulls_tests.cpp copying/sample_tests.cpp copying/scatter_tests.cpp copying/scatter_list_tests.cpp diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 4e0e70bf15c..44a79e63cd8 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -645,7 +645,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNonNested) 0, cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *expected); } TYPED_TEST(ListsStructsLeafTest, FromNested) diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp new file mode 100644 index 00000000000..77fd3f66ee5 --- /dev/null +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::test { + +using iterators::no_nulls; +using iterators::null_at; +using iterators::nulls_at; +using T = int32_t; // The actual type of the leaf node isn't really important. +using values_col_t = fixed_width_column_wrapper; +using offsets_col_t = fixed_width_column_wrapper; +using gather_map_t = fixed_width_column_wrapper; + +template +using LCW = cudf::test::lists_column_wrapper; + +struct PurgeNonEmptyNullsTest : public cudf::test::BaseFixture { + /// Helper to run gather() on a single column, and extract the single column from the result. + std::unique_ptr gather(column_view const& input, gather_map_t const& gather_map) + { + auto gathered = + cudf::gather(cudf::table_view{{input}}, gather_map, out_of_bounds_policy::NULLIFY); + return std::move(gathered->release()[0]); + } + + /// Verify that the result of `purge_nonempty_nulls()` is equivalent to the unpurged input, + /// except that the null rows are also empty. + template + void test_purge(ColumnViewT const& unpurged) + { + auto const purged = cudf::purge_nonempty_nulls(unpurged); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(unpurged.parent(), *purged); + EXPECT_FALSE(cudf::has_nonempty_nulls(*purged)); + } +}; + +// List. +TEST_F(PurgeNonEmptyNullsTest, SingleLevelList) +{ + auto const input = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, // <--- Will be set to NULL. Unsanitized row. 
+ {8, 9, 10}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows preceded by unsanitized rows. + auto const results = gather(input->view(), {3, 100, 0}); + auto const expected = LCW{{ + {8, 9, 10}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows followed by unsanitized rows. + auto const results = gather(input->view(), {1, 100, 0}); + auto const expected = LCW{{ + {5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects unsanitized row specifically. 
+ auto const results = gather(input->view(), {2}); + auto const results_lists_view = lists_column_view(*results); + auto const expected = LCW{{ + LCW{} // NULL. + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), values_col_t{}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>. +TEST_F(PurgeNonEmptyNullsTest, TwoLevelList) +{ + auto const input = + LCW{ + {{{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}, + {{21}, {22, 23}, {24, 25, 26}}, + {{31, 32}, {33, 34, 35, 36}, {}, {37, 38}}, //<--- Will be set to NULL. Unsanitized row. + {{41}, {42, 43}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Verify that gather() output is sanitized. + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{{ + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, // i.e. input[0] + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}} // i.e. 
input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_lists_view.child(), + LCW{ + {1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}, {11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}); + + auto const child_lists_view = lists_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_lists_view.offsets(), + offsets_col_t{0, 3, 7, 8, 10, 11, 13, 16, 19, 20}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + child_lists_view.child(), + values_col_t{1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 11, 12, 13, 14, 15, 16, 17, 18, 19}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>>. +TEST_F(PurgeNonEmptyNullsTest, ThreeLevelList) +{ + auto const input = LCW{{{{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}}, + {{{21, 21}}, {{22, 23}, {}}, {{24, 25}, {26}}}, + {{{31, 32}, {}}, + {{33, 34, 35}, {36}}, + {}, + {{37, 38}}}, //<--- Will be set to NULL. Unsanitized row. + {{{41, 41, 41}}, {{42, 43}}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{ + { + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, // i.e. 
input[0] + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}} // i.e. input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), + LCW{{{1, 2}, {3}}, + {{4, 5}, {6, 7}}, + {{8, 8}, {}}, + {{9, 1}}, + {{2, 3}}, + {{11, 12}}, + {{13}, {14, 15}}, + {{16, 17, 18}}, + {{19, 19}, {}}}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, ListOfStrings) +{ + using T = string_view; + + auto const input = LCW{{{{"1", "22", "", "4444"}, null_at(2)}, + {"55555"}, + {"666666", "7777777"}, // <--- Will be set to NULL. Unsanitized row. + {"88888888", "999999999", "1010101010"}, + {"11", "22", "33", "44"}, + {"55", "66", "77", "88"}}, + no_nulls()} + .release(); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{"55555"}, + {}, // NULL. 
+ {{"1", "22", "", "4444"}, null_at(2)}, + {"88888888", "999999999", "1010101010"}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + {"55555", "1", "22", "", "4444", "88888888", "999999999", "1010101010"}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Gathering from a sliced column. + auto const sliced = cudf::slice({input->view()}, {1, 5})[0]; // Lop off 1 row at each end. + EXPECT_TRUE(cudf::may_have_nonempty_nulls(sliced)); + EXPECT_TRUE(cudf::has_nonempty_nulls(sliced)); + + auto const results = gather(sliced, {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + auto const expected = LCW{{ + {}, + {"88888888", "999999999", "1010101010"}, + {"55555"}, + {"11", "22", "33", "44"}, + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 0, 3, 4, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + "88888888", "999999999", "1010101010", "55555", "11", "22", "33", "44"}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings) +{ + auto strings = + strings_column_wrapper{ + {"1", "22", "3", "44", "5", "66", "7", "8888", "9", "1010"}, //<--- "8888" will be + // unsanitized. + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*strings)); + + // Set strings nullmask, post construction. 
+ set_null_mask(strings->mutable_view().null_mask(), 7, 8, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*strings)); + + test_purge(strings_column_view{*strings}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + strings_column_view(*strings).offsets(), offsets_col_t{0, 1, 3, 4, 6, 7, 9, 10, 14, 15, 19} + // 10-14 indicates that "8888" is unsanitized. + ); + + // Construct a list column from the strings column. + auto const lists = make_lists_column(4, + offsets_col_t{0, 4, 5, 7, 10}.release(), + std::move(strings), + 0, + detail::make_null_mask(no_nulls(), no_nulls() + 4)); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + // Set lists nullmask, post construction. + cudf::detail::set_null_mask(lists->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + test_purge(lists_column_view{*lists}); + + // At this point, + // 1. {"66", "7"} will be unsanitized. + // 2. {"8888", "9", "1010"} will be actually be {NULL, "9", "1010"}. + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + lists_column_view(*lists).offsets(), + offsets_col_t{0, 4, 5, 7, 10}); // 5-7 indicates that list row#2 is unsanitized. + + auto const result = gather(lists->view(), {1, 2, 0, 3}); + auto const expected = LCW{{{"5"}, + {}, // NULL. + {"1", "22", "3", "44"}, + {{"", "9", "1010"}, null_at(0)}}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + + // Ensure row#2 has been sanitized. + auto const results_lists_view = lists_column_view(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 1, 1, 5, 8} + // 1-1 indicates that row#2 is sanitized. + ); + + // Ensure that "8888" has been sanitized, and stored as "". 
+ auto const child_strings_view = strings_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_strings_view.offsets(), + offsets_col_t{0, 1, 2, 4, 5, 7, 7, 8, 12}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); +} + +// Struct>. +TEST_F(PurgeNonEmptyNullsTest, StructOfList) +{ + auto const structs_input = + [] { + auto child = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, //<--- Unsanitized row. + {8, 9, 10}}, + no_nulls()}; + EXPECT_FALSE(cudf::has_nonempty_nulls(child)); + return structs_column_wrapper{{child}, null_at(2)}; + }() + .release(); + + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*structs_input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*structs_input)); + + test_purge(structs_column_view{*structs_input}); + + // At this point, even though the structs column has a null at index 2, + // the child column has a non-empty list row at index 2: {6, 7}. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lists_column_view(structs_input->child(0)).child(), + values_col_t{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, null_at(2)}); + + { + // Test rearrange. + auto const gather_map = gather_map_t{1, 2, 0, 3}; + auto const result = gather(structs_input->view(), gather_map); + auto const expected_result = [] { + auto child = LCW{{{5}, + LCW{}, //<--- Now, sanitized. 
+ {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + return structs_column_wrapper{{child}, null_at(1)}; + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_result); + auto const results_child = lists_column_view(result->child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); + } +} + +} // namespace cudf::test From 3c208a618f7f3443d021c01ad27f560a7d71e7d7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 29 Apr 2022 09:36:29 -0400 Subject: [PATCH 02/23] Enable pydocstyle rules involving quotes (#10748) This PR enables D30* errors for pydocstyle. It also sets up the `ignore-decorators` configuration so that future PRs involving D10* errors will treat docutils decorators appropriately. Contributes to #10711. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10748 --- .pre-commit-config.yaml | 15 ++++++ python/.flake8 | 24 +++++----- python/cudf/cudf/comm/gpuarrow.py | 4 +- python/cudf/cudf/core/column/string.py | 66 +++++++++++++------------- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/series.py | 4 +- 6 files changed, 66 insertions(+), 51 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f690f5f827..cd7b8aea6d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ repos: - repo: https://github.com/PyCQA/isort rev: 5.6.4 @@ -56,6 +58,19 @@ repos: hooks: - id: pydocstyle args: ["--config=python/.flake8"] + exclude: | + (?x)^( + ci| + cpp| + conda| + docs| + java| + notebooks| + python/dask_cudf| + python/cudf_kafka| + python/custreamz| + python/cudf/cudf/tests + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: diff --git a/python/.flake8 b/python/.flake8 index c645c46a216..667875030cc 100644 --- a/python/.flake8 +++ b/python/.flake8 @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. [flake8] exclude = __init__.py @@ -9,14 +9,14 @@ ignore = E203 [pydocstyle] -match = ^(.*abc\.py|.*api/types\.py|.*single_column_frame\.py|.*indexed_frame\.py)$ -# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather than include using match-dir. -match-dir = ^(?!ci|cpp|python/dask_cudf|python/cudf_kafka|python/custreamz).*$ -# In addition to numpy style, we additionally ignore: -add-ignore = - # magic methods - D105, - # no docstring in __init__ - D107, - # newlines before docstrings - D204 +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. 
+match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks|dask_cudf|cudf_kafka|custreamz|tests)).*$ +# Allow missing docstrings for docutils +ignore-decorators = .*(docutils|doc_apply|copy_docstring).* +select = + D30 diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 09b4cc5ffba..0c4d9d7f77e 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -119,12 +119,12 @@ def null(self): @property def data_raw(self): - "Accessor for the data buffer as a device array" + """Accessor for the data buffer as a device array""" return self._series._column.data_array_view @property def null_raw(self): - "Accessor for the null buffer as a device array" + """Accessor for the null buffer as a device array""" return self._series._column.mask_array_view def make_series(self): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1d836d9b759..0db7e7d9a27 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -201,7 +201,7 @@ def __getitem__(self, key): return self.get(key) def len(self) -> SeriesOrIndex: - """ + r""" Computes the length of each element in the Series/Index. Returns @@ -213,7 +213,7 @@ def len(self) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(["dog", "", "\\n", None]) + >>> s = cudf.Series(["dog", "", "\n", None]) >>> s.str.len() 0 3 1 0 @@ -960,7 +960,7 @@ def replace( ) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: - """ + r""" Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. 
@@ -980,7 +980,7 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: -------- >>> import cudf >>> s = cudf.Series(["A543","Z756"]) - >>> s.str.replace_with_backrefs('(\\\\d)(\\\\d)', 'V\\\\2\\\\1') + >>> s.str.replace_with_backrefs('(\\d)(\\d)', 'V\\2\\1') 0 AV453 1 ZV576 dtype: object @@ -1195,7 +1195,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex: ) def isfloat(self) -> SeriesOrIndex: - """ + r""" Check whether all characters in each string form floating value. If a string has zero characters, False is returned for @@ -1249,7 +1249,7 @@ def isfloat(self) -> SeriesOrIndex: 4 True 5 False dtype: bool - >>> s = cudf.Series(["this is plain text", "\\t\\n", "9.9", "9.9.9"]) + >>> s = cudf.Series(["this is plain text", "\t\n", "9.9", "9.9.9"]) >>> s.str.isfloat() 0 False 1 False @@ -2239,7 +2239,7 @@ def get(self, i: int = 0) -> SeriesOrIndex: return self._return_or_inplace(libstrings.get(self._column, i)) def get_json_object(self, json_path): - """ + r""" Applies a JSONPath string to an input strings column where each row in the column is a valid json string @@ -2258,7 +2258,7 @@ def get_json_object(self, json_path): >>> import cudf >>> s = cudf.Series( [ - \\"\\"\\" + \"\"\" { "store":{ "book":[ @@ -2277,13 +2277,13 @@ def get_json_object(self, json_path): ] } } - \\"\\"\\" + \"\"\" ]) >>> s - 0 {"store": {\\n "book": [\\n { "cat... + 0 {"store": {\n "book": [\n { "cat... dtype: object >>> s.str.get_json_object("$.store.book") - 0 [\\n { "category": "reference",\\n ... + 0 [\n { "category": "reference",\n ... dtype: object """ @@ -3138,7 +3138,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: ) def strip(self, to_strip: str = None) -> SeriesOrIndex: - """ + r""" Remove leading and trailing characters. Strip whitespaces (including newlines) or a set of @@ -3169,11 +3169,11 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. 
Bee!\\n', '3. Cat?\\t', None]) + >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) >>> s 0 1. Ant. - 1 2. Bee!\\n - 2 3. Cat?\\t + 1 2. Bee!\n + 2 3. Cat?\t 3 dtype: object >>> s.str.strip() @@ -3182,7 +3182,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex: 2 3. Cat? 3 dtype: object - >>> s.str.strip('123.!? \\n\\t') + >>> s.str.strip('123.!? \n\t') 0 Ant 1 Bee 2 Cat @@ -3197,7 +3197,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex: ) def lstrip(self, to_strip: str = None) -> SeriesOrIndex: - """ + r""" Remove leading and trailing characters. Strip whitespaces (including newlines) @@ -3228,11 +3228,11 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\\n', '3. Cat?\\t', None]) + >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) >>> s.str.lstrip('123.') 0 Ant. - 1 Bee!\\n - 2 Cat?\\t + 1 Bee!\n + 2 Cat?\t 3 dtype: object """ @@ -3244,7 +3244,7 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex: ) def rstrip(self, to_strip: str = None) -> SeriesOrIndex: - """ + r""" Remove leading and trailing characters. Strip whitespaces (including newlines) @@ -3277,14 +3277,14 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\\n', '3. Cat?\\t', None]) + >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) >>> s 0 1. Ant. - 1 2. Bee!\\n - 2 3. Cat?\\t + 1 2. Bee!\n + 2 3. Cat?\t 3 dtype: object - >>> s.str.rstrip('.!? \\n\\t') + >>> s.str.rstrip('.!? \n\t') 0 1. Ant 1 2. Bee 2 3. Cat @@ -3299,7 +3299,7 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex: ) def wrap(self, width: int, **kwargs) -> SeriesOrIndex: - """ + r""" Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. 
@@ -3340,8 +3340,8 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: >>> data = ['line to be wrapped', 'another line to be wrapped'] >>> s = cudf.Series(data) >>> s.str.wrap(12) - 0 line to be\\nwrapped - 1 another line\\nto be\\nwrapped + 0 line to be\nwrapped + 1 another line\nto be\nwrapped dtype: object """ if not is_integer(width): @@ -3575,7 +3575,7 @@ def isempty(self) -> SeriesOrIndex: return self._return_or_inplace((self._column == "").fillna(False)) def isspace(self) -> SeriesOrIndex: - """ + r""" Check whether all characters in each string are whitespace. This is equivalent to running the Python string method @@ -3623,7 +3623,7 @@ def isspace(self) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series([' ', '\\t\\r\\n ', '']) + >>> s = cudf.Series([' ', '\t\r\n ', '']) >>> s.str.isspace() 0 True 1 True @@ -4271,7 +4271,7 @@ def normalize_spaces(self) -> SeriesOrIndex: ) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: - """ + r""" Normalizes strings characters for tokenizing. This uses the normalizer that is built into the @@ -4280,7 +4280,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" - adding padding around the CJK Unicode block characters - - changing whitespace (e.g. ``\\t``, ``\\n``, ``\\r``) to space + - changing whitespace (e.g. ``\t``, ``\n``, ``\r``) to space - removing control characters (unicode categories "Cc" and "Cf") If `do_lower_case = true`, lower-casing also removes the accents. 
@@ -4303,7 +4303,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> ser = cudf.Series(["héllo, \\tworld","ĂĆCĖÑTED","$99"]) + >>> ser = cudf.Series(["héllo, \tworld","ĂĆCĖÑTED","$99"]) >>> ser.str.normalize_characters() 0 hello , world 1 accented diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 104ed3eeb67..d0e9e6d94c1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3356,7 +3356,7 @@ def to_dlpack(self): @_cudf_nvtx_annotate def to_string(self): - """ + r""" Convert to string cuDF uses Pandas internals for efficient string formatting. @@ -3373,7 +3373,7 @@ def to_string(self): >>> df['key'] = [0, 1, 2] >>> df['val'] = [float(i + 10) for i in range(3)] >>> df.to_string() - ' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0' + ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0' """ return repr(self) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4ff671509a0..d813db58d1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4614,13 +4614,13 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): @_cudf_nvtx_annotate def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): - """Returns a boolean array where two arrays are equal within a tolerance. + r"""Returns a boolean array where two arrays are equal within a tolerance. Two values in ``a`` and ``b`` are considered equal when the following equation is satisfied. .. math:: - |a - b| \\le \\mathrm{atol} + \\mathrm{rtol} |b| + |a - b| \le \mathrm{atol} + \mathrm{rtol} |b| Parameters ---------- From 15e49824a8cb2a5a7ec6a6e5f273589a66f1c120 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 29 Apr 2022 10:22:10 -0500 Subject: [PATCH 03/23] Enable pydocstyle for all packages. 
(#10759) Follow-up to #10748 to enable the base pydocstyle rules on all Python packages (`dask_cudf`, `cudf_kafka`, `custreamz`) and test files. Contributes to #10711, #10758. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10759 --- .pre-commit-config.yaml | 6 +----- python/.flake8 | 2 +- python/custreamz/custreamz/kafka.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cd7b8aea6d7..46d5223f7d3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -65,11 +65,7 @@ repos: conda| docs| java| - notebooks| - python/dask_cudf| - python/cudf_kafka| - python/custreamz| - python/cudf/cudf/tests + notebooks ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 diff --git a/python/.flake8 b/python/.flake8 index 667875030cc..b763c209fc1 100644 --- a/python/.flake8 +++ b/python/.flake8 @@ -15,7 +15,7 @@ ignore = # unlike the match option above this match-dir will have no effect when # pydocstyle is invoked from pre-commit. Therefore this exclusion list must # also be maintained in the pre-commit config file. 
-match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks|dask_cudf|cudf_kafka|custreamz|tests)).*$ +match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$ # Allow missing docstrings for docutils ignore-decorators = .*(docutils|doc_apply|copy_docstring).* select = diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index f5d5031602f..0198757c68d 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -95,7 +95,7 @@ def read_gdf( message_format="json", ): - """ + r""" Read messages from the underlying KafkaDatasource connection and create a cudf Dataframe From 3c4e72e68d9406d65939b7d2fdf28b0b921840dd Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 29 Apr 2022 21:24:12 +0530 Subject: [PATCH 04/23] Add row hasher with nested column support (#10641) Contributes to #10186 Authors: - Devavret Makkar (https://github.com/devavret) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10641 --- cpp/benchmarks/stream_compaction/distinct.cpp | 41 +++ cpp/include/cudf/detail/hashing.hpp | 5 +- cpp/include/cudf/detail/iterator.cuh | 8 +- .../cudf/detail/utilities/algorithm.cuh | 28 ++ cpp/include/cudf/detail/utilities/column.hpp | 10 +- .../cudf/table/experimental/row_operators.cuh | 273 +++++++++++++++--- cpp/src/hash/hashing.cu | 29 +- cpp/src/hash/murmur_hash.cu | 28 +- cpp/src/stream_compaction/distinct.cu | 18 +- .../stream_compaction_common.cuh | 22 ++ cpp/src/table/row_operators.cu | 60 ++-- cpp/tests/hashing/hash_test.cpp | 224 +++++++++++++- cpp/tests/reductions/list_rank_test.cpp | 4 +- .../stream_compaction/distinct_tests.cpp | 242 ++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 2 +- 15 files changed, 880 insertions(+), 114 deletions(-) create mode 100644 
cpp/include/cudf/detail/utilities/algorithm.cuh diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index 749badc715d..149c6ad7219 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +template +void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) +{ + cudf::rmm_pool_raii pool_raii; + + auto const size = state.get_int64("ColumnSize"); + auto const dtype = cudf::type_to_id(); + double const null_frequency = state.get_float64("null_frequency"); + + data_profile table_data_profile; + if (dtype == cudf::type_id::LIST) { + table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4); + table_data_profile.set_distribution_params( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4); + table_data_profile.set_list_depth(1); + } else { + // We're comparing distinct() on a non-nested column to that on a list column with the same + // number of distinct rows. The max list size is 4 and the number of distinct values in the + // list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781 + // We want this column to also have 781 distinct values. 
+ table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 781); + } + table_data_profile.set_null_frequency(null_frequency); + + auto const table = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, table_data_profile, 0); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view); + }); +} + +NVBENCH_BENCH_TYPES(nvbench_distinct_list, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("distinct_list") + .set_type_axes_names({"Type"}) + .add_float64_axis("null_frequency", {0.0, 0.1}) + .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index e8e100aaec5..9958fa8f3a4 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -33,19 +33,20 @@ namespace detail { std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = 0, + uint32_t seed = cudf::DEFAULT_HASH_SEED, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, + uint32_t seed = cudf::DEFAULT_HASH_SEED, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); template