From 4229fd61c7978a13efd71fad75adad1a3d63a60f Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Sun, 17 Jan 2021 11:54:17 -0800 Subject: [PATCH 01/24] JUNCO: Working prototype: No min-periods check yet. --- cpp/tests/CMakeLists.txt | 8 ++ cpp/tests/collect_list/collect_list_test.cu | 144 ++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 cpp/tests/collect_list/collect_list_test.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2592b5e4221..ccb9a8dd917 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -492,6 +492,14 @@ set(LEAD_LAG_TEST_SRC ConfigureTest(LEAD_LAG_TEST "${LEAD_LAG_TEST_SRC}") +################################################################################################### +# - collect_list rolling tests --------------------------------------------------------------------------------- + +set(COLLECT_LIST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/collect_list/collect_list_test.cu") + +ConfigureTest(COLLECT_LIST_TEST "${COLLECT_LIST_SRC}") + ################################################################################################### # - filling test ---------------------------------------------------------------------------------- diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu new file mode 100644 index 00000000000..6aa470cb5a0 --- /dev/null +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +struct CollectListTest : public cudf::test::BaseFixture {}; + +#include +#include +#include +#include +#include + +cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) +{ + // Number of rows in child-column == last offset value. + cudf::size_type num_child_rows{}; + CUDA_TRY(cudaMemcpyAsync(&num_child_rows, + list_offsets.data() + list_offsets.size() - 1, + sizeof(cudf::size_type), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + return num_child_rows; +} + +void print(std::string const& msg, cudf::column_view col) +{ + std::cout << msg << std::endl; + cudf::test::print(col); + std::cout << std::endl; +} + +void print(std::string const& msg, std::unique_ptr const& col) +{ + print(msg, col->view()); +} + +void foo() +{ + using namespace cudf; + using namespace cudf::test; + + auto size_data_type = data_type{type_to_id()}; + + auto ints_column = fixed_width_column_wrapper{70,71,72,73,74}; + + auto prev_column = fixed_width_column_wrapper{1,2,2,2,2}; + auto foll_column = fixed_width_column_wrapper{1,1,1,1,0}; + + CUDF_EXPECTS(static_cast(prev_column).size() == static_cast(foll_column).size(), ""); + + auto sizes = cudf::binary_operation(prev_column, foll_column, binary_operator::ADD, data_type{size_data_type}); + auto offsets = cudf::strings::detail::make_offsets_child_column(sizes->view().begin(), sizes->view().end()); + print("Offsets:", offsets); + + // Bail if offsets.size() < 2; + + auto scatter_map = make_fixed_width_column(size_data_type, offsets->size()-2); + thrust::copy(thrust::device, offsets->view().begin()+1, offsets->view().end()-1, scatter_map->mutable_view().begin()); + print("Scatter_map: ", 
scatter_map); + + auto scatter_input = make_fixed_width_column(size_data_type, offsets->size()-2); + thrust::fill_n(thrust::device, scatter_input->mutable_view().begin(), offsets->size()-2, size_type{1}); + + auto num_child_rows = get_num_child_rows(offsets->view()); + + auto child_index_column_input = make_fixed_width_column(size_data_type , num_child_rows); + thrust::fill_n(thrust::device, child_index_column_input->mutable_view().begin(), num_child_rows, 0); + + thrust::scatter(thrust::device, scatter_input->view().begin(), scatter_input->view().end(), + scatter_map->view().begin(), child_index_column_input->mutable_view().begin()); + + auto per_row_group_mapping = make_fixed_width_column(size_data_type, num_child_rows); + thrust::inclusive_scan(thrust::device, + child_index_column_input->view().begin(), + child_index_column_input->view().end(), + per_row_group_mapping->mutable_view().begin()); + print("Per_row_group_mapping: ", per_row_group_mapping); + + auto gather_map = make_fixed_width_column(size_data_type, num_child_rows); + thrust::for_each_n( + thrust::device, + thrust::make_counting_iterator(0), + num_child_rows, + [d_offsets = offsets->view().begin(), // [0, 2, 5, 8, 11, 13] + d_groups = per_row_group_mapping->view().begin(), // [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] + d_prev = static_cast(prev_column).data(), + d_output = gather_map->mutable_view().begin()] + __device__(auto i) { + auto group = d_groups[i]; + auto group_start_offset = d_offsets[group]; + auto relative_index = i - group_start_offset; + + d_output[i] = (group - d_prev[group] + 1) + relative_index; + } + ); + print("Gather_map: ", gather_map); + + auto input_columns = std::vector>{}; + input_columns.emplace_back(std::make_unique(ints_column)); + auto input_table = cudf::table{std::move(input_columns)}; + + auto output_table = cudf::gather(input_table.view(), gather_map->view()); + + print("Gathered column: ", output_table->get_column(0).view()); + +} + +TEST_F(CollectListTest, ProofOfConcept) 
+{ + foo(); +} + +CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file From ca0f6516a29a0e9bf9b1e9de7c23201749e40697 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Sun, 17 Jan 2021 15:51:19 -0800 Subject: [PATCH 02/24] WIP: Got offsets. --- cpp/src/rolling/rolling_detail.cuh | 44 ++++++++++++++++++++- cpp/src/rolling/rolling_detail.hpp | 16 +++++--- cpp/tests/collect_list/collect_list_test.cu | 20 ++++++++++ 3 files changed, 73 insertions(+), 7 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index e0ae16bed7f..02425af27c7 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -310,7 +311,7 @@ template ::value and !(op == aggregation::COUNT_VALID || op == aggregation::COUNT_ALL || op == aggregation::ROW_NUMBER || op == aggregation::LEAD || - op == aggregation::LAG)>* = nullptr> + op == aggregation::LAG || op == aggregation::COLLECT)>* = nullptr> bool __device__ process_rolling_window(column_device_view input, column_device_view ignored_default_outputs, mutable_column_device_view output, @@ -814,7 +815,7 @@ struct rolling_window_launcher { template - std::enable_if_t> operator()(column_view const& input, column_view const& default_outputs, @@ -895,6 +896,45 @@ struct rolling_window_launcher { stream, mr); } + + template + std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr> + operator()(column_view const& input, + column_view const& default_outputs, + PrecedingWindowIterator preceding_window_begin, + FollowingWindowIterator following_window_begin, + size_type min_periods, + std::unique_ptr const& agg, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // COLLECT() should be supported on all data types. + // Output column must be of type `list`. + + // FIXME: min_periods not yet supported: + // 1. 
Short term: Construct null-mask based on (list.size() >= min_periods) + // 2. Long term: Reduce list sizes to zero, for null rows. + + using namespace cudf; + if (input.is_empty()) return empty_like(input); + + // Materialize offsets column. + auto size_data_type = data_type{type_to_id()}; + auto sizes = make_fixed_width_column(size_data_type, input.size()); + auto mutable_sizes = sizes->mutable_view(); + thrust::transform(thrust::device, + preceding_window_begin, + preceding_window_begin + input.size(), + following_window_begin, + mutable_sizes.begin(), + [] __device__(auto preceding, auto following) { return preceding + following; } + ); + auto offsets = cudf::strings::detail::make_offsets_child_column(sizes->view().begin(), + sizes->view().end()); + return offsets; + } }; struct dispatch_rolling { diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index ed136405a79..b471711dc37 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -41,7 +41,7 @@ static constexpr bool is_rolling_supported() (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or (op == aggregation::MEAN) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG); + (op == aggregation::LAG) or (op == aggregation::COLLECT); constexpr bool is_valid_numeric_agg = (cudf::is_numeric() or cudf::is_duration() or @@ -53,17 +53,23 @@ static constexpr bool is_rolling_supported() } else if (cudf::is_timestamp() || cudf::is_fixed_point()) { return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or (op == aggregation::LAG); + (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or (op == aggregation::LAG) or + (op == aggregation::COLLECT); } else if 
(std::is_same()) { return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER); + (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT); } else if (std::is_same()) { return (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER); - } else + (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT); + } else if (std::is_same()) { + // TODO: Add support for COUNT_VALID, COUNT_ALL, ROW_NUMBER. + return op == aggregation::COLLECT; + } + else { return false; + } } // return true if this Op is specialized for strings. diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu index 6aa470cb5a0..28cbd19197f 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -141,4 +141,24 @@ TEST_F(CollectListTest, ProofOfConcept) foo(); } +TEST_F(CollectListTest, Integration) +{ + using namespace cudf; + using namespace cudf::test; + + auto size_data_type = data_type{type_to_id()}; + + auto ints_column = fixed_width_column_wrapper{70,71,72,73,74}; + + auto prev_column = fixed_width_column_wrapper{1,2,2,2,2}; + auto foll_column = fixed_width_column_wrapper{1,1,1,1,0}; + + CUDF_EXPECTS(static_cast(prev_column).size() == static_cast(foll_column).size(), ""); + + auto sizes = cudf::rolling_window(ints_column, prev_column, foll_column, 1, make_collect_aggregation()); + + print("Sizes: ", *sizes); + +} + CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file From 5b639c59448952e5049ba0d333241fbd54e7b8be Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 18 Jan 2021 21:26:19 -0800 Subject: [PATCH 03/24] WIP: Got child/input mapping. 
--- cpp/src/rolling/rolling_detail.cuh | 110 ++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 18 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 02425af27c7..1ac420e1abd 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -896,20 +896,61 @@ struct rolling_window_launcher { stream, mr); } - + + // TODO: Move to lists/utilities. + cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) + { + // Number of rows in child-column == last offset value. + cudf::size_type num_child_rows{}; + CUDA_TRY(cudaMemcpyAsync(&num_child_rows, + list_offsets.data() + list_offsets.size() - 1, + sizeof(cudf::size_type), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + return num_child_rows; + } + + template + std::unique_ptr get_collect_list_offsets(column_view const& input, + PrecedingIter preceding_begin, + FollowingIter following_begin, + size_type min_periods, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // Materialize offsets column. 
+ auto size_data_type = data_type{type_to_id()}; + auto sizes = make_fixed_width_column(size_data_type, input.size()); + auto mutable_sizes = sizes->mutable_view(); + thrust::transform(thrust::device, + preceding_begin, + preceding_begin + input.size(), + following_begin, + mutable_sizes.begin(), + [] __device__(auto preceding, auto following) { return preceding + following; } + ); + return cudf::strings::detail::make_offsets_child_column(sizes->view().begin(), + sizes->view().end(), stream, mr); + } + template + typename PrecedingIter, + typename FollowingIter> std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr> operator()(column_view const& input, column_view const& default_outputs, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, + PrecedingIter preceding_begin, + FollowingIter following_begin, size_type min_periods, std::unique_ptr const& agg, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(default_outputs.is_empty(), + "Only LEAD/LAG window functions support default values."); + // COLLECT() should be supported on all data types. // Output column must be of type `list`. @@ -918,22 +959,55 @@ struct rolling_window_launcher { // 2. Long term: Reduce list sizes to zero, for null rows. using namespace cudf; + using namespace cudf::detail; + if (input.is_empty()) return empty_like(input); - // Materialize offsets column. + // Materialize collect list's offsets. 
+ auto offsets = get_collect_list_offsets(input, preceding_begin, following_begin, min_periods, stream, mr); + auto size_data_type = data_type{type_to_id()}; - auto sizes = make_fixed_width_column(size_data_type, input.size()); - auto mutable_sizes = sizes->mutable_view(); - thrust::transform(thrust::device, - preceding_window_begin, - preceding_window_begin + input.size(), - following_window_begin, - mutable_sizes.begin(), - [] __device__(auto preceding, auto following) { return preceding + following; } - ); - auto offsets = cudf::strings::detail::make_offsets_child_column(sizes->view().begin(), - sizes->view().end()); - return offsets; + + // Generate list child's mapping to parent list. + // If + // input list == [A,B,C,D,E] + // and preceding == [1,2,2,2,2], + // and following == [1,1,1,1,0], + // then, + // result == [ [A,B], [A,B,C], [B,C,D], [C,D,E], [D,E] ] + // i.e. result offset column == [0,2,5,8,11,13], + // and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. + // Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] + + // 1. Scatter `1` to all offsets except the first and last. 
+ auto num_child_rows = get_num_child_rows(offsets->view()); + auto scatter_map = make_fixed_width_column(size_data_type, offsets->size()-2); + thrust::copy(thrust::device, + offsets->view().template begin()+1, + offsets->view().template end()-1, + scatter_map->mutable_view().template begin()); + auto scatter_values = make_fixed_width_column(size_data_type, offsets->size()-2); + thrust::fill_n(thrust::device, + scatter_values->mutable_view().template begin(), + offsets->size()-2, + size_type{1}); // [1,1,1,1,...1] + auto scatter_output = make_fixed_width_column(size_data_type , num_child_rows); + thrust::fill_n(thrust::device, + scatter_output->mutable_view().template begin(), + num_child_rows, + 0); // [0,0,0,...0] + thrust::scatter(thrust::device, + scatter_values->view().template begin(), + scatter_values->view().template end(), + scatter_map->view().template begin(), + scatter_output->mutable_view().template begin()); // [0,1,0,0,1,...] + auto per_row_mapping = make_fixed_width_column(size_data_type, num_child_rows); + thrust::inclusive_scan(thrust::device, + scatter_output->view().template begin(), + scatter_output->view().template end(), + per_row_mapping->mutable_view().template begin()); + + return per_row_mapping; } }; From 3170c18f3acd865fbb6c8c16b3cfdaad20748167 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Tue, 19 Jan 2021 12:17:55 -0800 Subject: [PATCH 04/24] WIP: Got gather map working. --- cpp/src/rolling/rolling_detail.cuh | 145 +++++++++++++++++++---------- 1 file changed, 97 insertions(+), 48 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 1ac420e1abd..2d8a0f56ffc 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -920,9 +920,11 @@ struct rolling_window_launcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using namespace cudf; + // Materialize offsets column. 
auto size_data_type = data_type{type_to_id()}; - auto sizes = make_fixed_width_column(size_data_type, input.size()); + auto sizes = make_fixed_width_column(size_data_type, input.size(), mask_state::UNALLOCATED, stream, mr); auto mutable_sizes = sizes->mutable_view(); thrust::transform(thrust::device, preceding_begin, @@ -931,8 +933,72 @@ struct rolling_window_launcher { mutable_sizes.begin(), [] __device__(auto preceding, auto following) { return preceding + following; } ); - return cudf::strings::detail::make_offsets_child_column(sizes->view().begin(), - sizes->view().end(), stream, mr); + return strings::detail::make_offsets_child_column(sizes->view().begin(), + sizes->view().end(), stream, mr); + } + + /** + * @brief Generate collect() list child's mapping to input column. + * + * If + * input col == [A,B,C,D,E] + * and preceding == [1,2,2,2,2], + * and following == [1,1,1,1,0], + * then, + * collect result == [ [A,B], [A,B,C], [B,C,D], [C,D,E], [D,E] ] + * i.e. result offset column == [0,2,5,8,11,13], + * and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. + * Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] + */ + std::unique_ptr get_list_child_to_input_mapping(cudf::column_view const& offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using namespace cudf; + + auto size_data_type = data_type{type_to_id()}; + + // First, scatter `1` to all offsets except the first and last, + // into a column of N `0`s, where N == number of child rows. 
+ // For the example above: + // offsets == [0, 2, 5, 8, 11, 13] + // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] + // + auto num_child_rows = get_num_child_rows(offsets); + auto scatter_values = make_fixed_width_column(size_data_type, + offsets.size()-2, + mask_state::UNALLOCATED, + stream, + mr); + thrust::fill_n(thrust::device, + scatter_values->mutable_view().template begin(), + offsets.size()-2, + size_type{1}); // [1,1,1,1,...1] + auto scatter_output = make_fixed_width_column(size_data_type, + num_child_rows, + mask_state::UNALLOCATED, + stream, + mr); + thrust::fill_n(thrust::device, + scatter_output->mutable_view().template begin(), + num_child_rows, + 0); // [0,0,0,...0] + thrust::scatter(thrust::device, + scatter_values->view().template begin(), + scatter_values->view().template end(), + offsets.template begin() + 1, + scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] + + // Next, generate mapping with inclusive_scan() on scatter() result. + // For the example above: + // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] + // inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4] + auto per_row_mapping = make_fixed_width_column(size_data_type, num_child_rows); + thrust::inclusive_scan(thrust::device, + scatter_output->view().template begin(), + scatter_output->view().template end(), + per_row_mapping->mutable_view().template begin()); + return per_row_mapping; } template `. @@ -964,50 +1030,33 @@ struct rolling_window_launcher { if (input.is_empty()) return empty_like(input); // Materialize collect list's offsets. - auto offsets = get_collect_list_offsets(input, preceding_begin, following_begin, min_periods, stream, mr); - - auto size_data_type = data_type{type_to_id()}; - - // Generate list child's mapping to parent list. - // If - // input list == [A,B,C,D,E] - // and preceding == [1,2,2,2,2], - // and following == [1,1,1,1,0], - // then, - // result == [ [A,B], [A,B,C], [B,C,D], [C,D,E], [D,E] ] - // i.e. 
result offset column == [0,2,5,8,11,13], - // and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. - // Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] - - // 1. Scatter `1` to all offsets except the first and last. - auto num_child_rows = get_num_child_rows(offsets->view()); - auto scatter_map = make_fixed_width_column(size_data_type, offsets->size()-2); - thrust::copy(thrust::device, - offsets->view().template begin()+1, - offsets->view().template end()-1, - scatter_map->mutable_view().template begin()); - auto scatter_values = make_fixed_width_column(size_data_type, offsets->size()-2); - thrust::fill_n(thrust::device, - scatter_values->mutable_view().template begin(), - offsets->size()-2, - size_type{1}); // [1,1,1,1,...1] - auto scatter_output = make_fixed_width_column(size_data_type , num_child_rows); - thrust::fill_n(thrust::device, - scatter_output->mutable_view().template begin(), - num_child_rows, - 0); // [0,0,0,...0] - thrust::scatter(thrust::device, - scatter_values->view().template begin(), - scatter_values->view().template end(), - scatter_map->view().template begin(), - scatter_output->mutable_view().template begin()); // [0,1,0,0,1,...] 
- auto per_row_mapping = make_fixed_width_column(size_data_type, num_child_rows); - thrust::inclusive_scan(thrust::device, - scatter_output->view().template begin(), - scatter_output->view().template end(), - per_row_mapping->mutable_view().template begin()); - - return per_row_mapping; + auto offsets = get_collect_list_offsets(input, + preceding_begin, + following_begin, + min_periods, + stream, + mr); + + auto per_row_mapping = get_list_child_to_input_mapping(offsets->view(), stream, mr); + + auto gather_map = make_fixed_width_column(data_type{type_to_id()}, per_row_mapping->size()); + thrust::for_each_n( + thrust::device, + thrust::make_counting_iterator(0), + per_row_mapping->size(), + [d_offsets = offsets->view().template begin(), // [0, 2, 5, 8, 11, 13] + d_groups = per_row_mapping->view().template begin(), // [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] + d_prev = preceding_begin, + d_output = gather_map->mutable_view().template begin()] + __device__(auto i) { + auto group = d_groups[i]; + auto group_start_offset = d_offsets[group]; + auto relative_index = i - group_start_offset; + + d_output[i] = (group - d_prev[group] + 1) + relative_index; + } + ); + return gather_map; } }; From f0a208e5f045fd1ca998ae50fa627ee405aa105c Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Tue, 19 Jan 2021 15:43:47 -0800 Subject: [PATCH 05/24] Working. No min_periods handling yet. Minimal test. 
--- cpp/src/rolling/rolling_detail.cuh | 76 +++++++++--- cpp/tests/collect_list/collect_list_test.cu | 130 +++----------------- 2 files changed, 75 insertions(+), 131 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 2d8a0f56ffc..ec43de566e4 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -963,7 +963,7 @@ struct rolling_window_launcher { // For the example above: // offsets == [0, 2, 5, 8, 11, 13] // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] - // + auto num_child_rows = get_num_child_rows(offsets); auto scatter_values = make_fixed_width_column(size_data_type, offsets.size()-2, @@ -993,7 +993,11 @@ struct rolling_window_launcher { // For the example above: // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] // inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4] - auto per_row_mapping = make_fixed_width_column(size_data_type, num_child_rows); + auto per_row_mapping = make_fixed_width_column(size_data_type, + num_child_rows, + mask_state::UNALLOCATED, + stream, + mr); thrust::inclusive_scan(thrust::device, scatter_output->view().template begin(), scatter_output->view().template end(), @@ -1001,6 +1005,37 @@ struct rolling_window_launcher { return per_row_mapping; } + template + std::unique_ptr get_gather_map_for_child_column(column_view const& child_offsets, + column_view const& per_row_mapping, + PrecedingIter preceding_iter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto gather_map = make_fixed_width_column(data_type{type_to_id()}, + per_row_mapping.size(), + mask_state::UNALLOCATED, + stream, + mr); + thrust::for_each_n( // Convert to transform(). + thrust::device, + thrust::make_counting_iterator(0), + per_row_mapping.size(), + [d_offsets = child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] + d_groups = per_row_mapping.template begin(), // E.g. 
[0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] + d_prev = preceding_iter, + d_output = gather_map->mutable_view().template begin()] + __device__(auto i) { + auto group = d_groups[i]; + auto group_start_offset = d_offsets[group]; + auto relative_index = i - group_start_offset; + + d_output[i] = (group - d_prev[group] + 1) + relative_index; + } + ); + return gather_map; + } + template @@ -1037,26 +1072,27 @@ struct rolling_window_launcher { stream, mr); + // Map each element of the collect() result's child column + // to the index where it appears in the input. auto per_row_mapping = get_list_child_to_input_mapping(offsets->view(), stream, mr); - auto gather_map = make_fixed_width_column(data_type{type_to_id()}, per_row_mapping->size()); - thrust::for_each_n( - thrust::device, - thrust::make_counting_iterator(0), - per_row_mapping->size(), - [d_offsets = offsets->view().template begin(), // [0, 2, 5, 8, 11, 13] - d_groups = per_row_mapping->view().template begin(), // [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] - d_prev = preceding_begin, - d_output = gather_map->mutable_view().template begin()] - __device__(auto i) { - auto group = d_groups[i]; - auto group_start_offset = d_offsets[group]; - auto relative_index = i - group_start_offset; - - d_output[i] = (group - d_prev[group] + 1) + relative_index; - } - ); - return gather_map; + // Generate gather map to produce the collect() result's child column. + auto gather_map = get_gather_map_for_child_column(offsets->view(), + per_row_mapping->view(), + preceding_begin, + stream, + mr); + + // gather(), to construct child column. 
+ auto gather_output = cudf::gather(table_view{std::vector{input}}, gather_map->view()); + + return make_lists_column(input.size(), + std::move(offsets), + std::move(gather_output->release()[0]), + 0, + {}, + stream, + mr); } }; diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu index 28cbd19197f..cbc9371fd03 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -31,134 +31,42 @@ #include #include + struct CollectListTest : public cudf::test::BaseFixture {}; -#include -#include -#include -#include -#include +template +struct TypedCollectListTest : public CollectListTest {}; -cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) -{ - // Number of rows in child-column == last offset value. - cudf::size_type num_child_rows{}; - CUDA_TRY(cudaMemcpyAsync(&num_child_rows, - list_offsets.data() + list_offsets.size() - 1, - sizeof(cudf::size_type), - cudaMemcpyDeviceToHost, - stream.value())); - stream.synchronize(); - return num_child_rows; -} +using TypesForTest = cudf::test::Concat; -void print(std::string const& msg, cudf::column_view col) -{ - std::cout << msg << std::endl; - cudf::test::print(col); - std::cout << std::endl; -} +TYPED_TEST_CASE(TypedCollectListTest, TypesForTest); -void print(std::string const& msg, std::unique_ptr const& col) -{ - print(msg, col->view()); -} - -void foo() +TYPED_TEST(TypedCollectListTest, NoNulls) { using namespace cudf; using namespace cudf::test; - auto size_data_type = data_type{type_to_id()}; + using T = TypeParam; - auto ints_column = fixed_width_column_wrapper{70,71,72,73,74}; + auto ints_column = fixed_width_column_wrapper{70,71,72,73,74}; auto prev_column = fixed_width_column_wrapper{1,2,2,2,2}; auto foll_column = fixed_width_column_wrapper{1,1,1,1,0}; - CUDF_EXPECTS(static_cast(prev_column).size() == static_cast(foll_column).size(), ""); - - 
auto sizes = cudf::binary_operation(prev_column, foll_column, binary_operator::ADD, data_type{size_data_type}); - auto offsets = cudf::strings::detail::make_offsets_child_column(sizes->view().begin(), sizes->view().end()); - print("Offsets:", offsets); - - // Bail if offsets.size() < 2; - - auto scatter_map = make_fixed_width_column(size_data_type, offsets->size()-2); - thrust::copy(thrust::device, offsets->view().begin()+1, offsets->view().end()-1, scatter_map->mutable_view().begin()); - print("Scatter_map: ", scatter_map); - - auto scatter_input = make_fixed_width_column(size_data_type, offsets->size()-2); - thrust::fill_n(thrust::device, scatter_input->mutable_view().begin(), offsets->size()-2, size_type{1}); - - auto num_child_rows = get_num_child_rows(offsets->view()); - - auto child_index_column_input = make_fixed_width_column(size_data_type , num_child_rows); - thrust::fill_n(thrust::device, child_index_column_input->mutable_view().begin(), num_child_rows, 0); - - thrust::scatter(thrust::device, scatter_input->view().begin(), scatter_input->view().end(), - scatter_map->view().begin(), child_index_column_input->mutable_view().begin()); - - auto per_row_group_mapping = make_fixed_width_column(size_data_type, num_child_rows); - thrust::inclusive_scan(thrust::device, - child_index_column_input->view().begin(), - child_index_column_input->view().end(), - per_row_group_mapping->mutable_view().begin()); - print("Per_row_group_mapping: ", per_row_group_mapping); - - auto gather_map = make_fixed_width_column(size_data_type, num_child_rows); - thrust::for_each_n( - thrust::device, - thrust::make_counting_iterator(0), - num_child_rows, - [d_offsets = offsets->view().begin(), // [0, 2, 5, 8, 11, 13] - d_groups = per_row_group_mapping->view().begin(), // [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] - d_prev = static_cast(prev_column).data(), - d_output = gather_map->mutable_view().begin()] - __device__(auto i) { - auto group = d_groups[i]; - auto group_start_offset = 
d_offsets[group]; - auto relative_index = i - group_start_offset; - - d_output[i] = (group - d_prev[group] + 1) + relative_index; - } - ); - print("Gather_map: ", gather_map); - - auto input_columns = std::vector>{}; - input_columns.emplace_back(std::make_unique(ints_column)); - auto input_table = cudf::table{std::move(input_columns)}; - - auto output_table = cudf::gather(input_table.view(), gather_map->view()); - - print("Gathered column: ", output_table->get_column(0).view()); - -} - -TEST_F(CollectListTest, ProofOfConcept) -{ - foo(); -} - -TEST_F(CollectListTest, Integration) -{ - using namespace cudf; - using namespace cudf::test; - - auto size_data_type = data_type{type_to_id()}; - - auto ints_column = fixed_width_column_wrapper{70,71,72,73,74}; - - auto prev_column = fixed_width_column_wrapper{1,2,2,2,2}; - auto foll_column = fixed_width_column_wrapper{1,1,1,1,0}; + EXPECT_EQ(static_cast(prev_column).size(), static_cast(foll_column).size()); - CUDF_EXPECTS(static_cast(prev_column).size() == static_cast(foll_column).size(), ""); + auto result = cudf::rolling_window(ints_column, prev_column, foll_column, 1, make_collect_aggregation()); - auto sizes = cudf::rolling_window(ints_column, prev_column, foll_column, 1, make_collect_aggregation()); + auto expected_result = lists_column_wrapper{ + {70, 71}, + {70, 71, 72}, + {71, 72, 73}, + {72, 73, 74}, + {73, 74}, + }.release(); - print("Sizes: ", *sizes); - + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file From d8b834b891cb67392965b5f20d7f371cf9d82405 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Tue, 19 Jan 2021 21:50:54 -0800 Subject: [PATCH 06/24] WIP: Added fixup for rolling_window() iterators. 
--- cpp/src/rolling/rolling_detail.cuh | 22 +++++++-- cpp/tests/collect_list/collect_list_test.cu | 53 ++++++++++++++++----- 2 files changed, 61 insertions(+), 14 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index ec43de566e4..8b3c15b1a47 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -946,7 +946,7 @@ struct rolling_window_launcher { * and following == [1,1,1,1,0], * then, * collect result == [ [A,B], [A,B,C], [B,C,D], [C,D,E], [D,E] ] - * i.e. result offset column == [0,2,5,8,11,13], + * i.e. result offset column == [0,2,5,8,11,13], * and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. * Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] */ @@ -1042,8 +1042,8 @@ struct rolling_window_launcher { std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr> operator()(column_view const& input, column_view const& default_outputs, - PrecedingIter preceding_begin, - FollowingIter following_begin, + PrecedingIter preceding_begin_raw, + FollowingIter following_begin_raw, size_type min_periods, std::unique_ptr const& agg, rmm::cuda_stream_view stream, @@ -1064,6 +1064,22 @@ struct rolling_window_launcher { if (input.is_empty()) return empty_like(input); + // Fix up preceding/following iterators to respect column boundaries, + // similar to gpu_rolling(). + // `rolling_window()` does not fix up preceding/following so as not to read past + // column boundaries. + // `grouped_rolling_window()` and `time_range_based_grouped_rolling_window()` do. 
+ auto preceding_begin + = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [preceding_begin_raw] __device__(auto i) { + return thrust::min(preceding_begin_raw[i], i+1); + }); + auto following_begin + = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [following_begin_raw, size = input.size()] __device__(auto i) { + return thrust::min(following_begin_raw[i], size - i - 1); + }); + // Materialize collect list's offsets. auto offsets = get_collect_list_offsets(input, preceding_begin, diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu index cbc9371fd03..827419bb20b 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -37,35 +37,66 @@ struct CollectListTest : public cudf::test::BaseFixture {}; template struct TypedCollectListTest : public CollectListTest {}; -using TypesForTest = cudf::test::Concat; +using TypesForTest = cudf::test::Concat; TYPED_TEST_CASE(TypedCollectListTest, TypesForTest); -TYPED_TEST(TypedCollectListTest, NoNulls) +TYPED_TEST(TypedCollectListTest, BasicRollingWindowNoNulls) { using namespace cudf; using namespace cudf::test; using T = TypeParam; - auto ints_column = fixed_width_column_wrapper{70,71,72,73,74}; + auto input_column = fixed_width_column_wrapper{10,11,12,13,14}; auto prev_column = fixed_width_column_wrapper{1,2,2,2,2}; auto foll_column = fixed_width_column_wrapper{1,1,1,1,0}; EXPECT_EQ(static_cast(prev_column).size(), static_cast(foll_column).size()); - auto result = cudf::rolling_window(ints_column, prev_column, foll_column, 1, make_collect_aggregation()); + auto result_column_based_window = rolling_window(input_column, prev_column, foll_column, 1, make_collect_aggregation()); - auto expected_result = lists_column_wrapper{ - {70, 71}, - {70, 71, 72}, - {71, 72, 73}, - {72, 73, 74}, - {73, 74}, + auto expected_result = lists_column_wrapper{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {12, 
13, 14}, + {13, 14}, }.release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); + + auto result_fixed_window = rolling_window(input_column, 2, 1, 1, make_collect_aggregation()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_fixed_window->view()); +} + +TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowNoNulls) +{ + using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto input_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; + + auto result = grouped_rolling_window(table_view{std::vector{group_column}}, input_column, 2, 1, 1, make_collect_aggregation()); + + auto expected_result = lists_column_wrapper{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {12, 13, 14}, + {13, 14}, + {20, 21}, + {20, 21, 22}, + {21, 22, 23}, + {22, 23} + }.release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } From 2568341087a520f5b8d571f111231aeafd65f07c Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 20 Jan 2021 01:07:54 -0800 Subject: [PATCH 07/24] WIP: Added support for empty lists in result. --- cpp/src/rolling/rolling_detail.cuh | 38 ++++++++++++++++----- cpp/tests/collect_list/collect_list_test.cu | 29 ++++++++++++++++ 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 8b3c15b1a47..527bcba1527 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -958,22 +958,38 @@ struct rolling_window_launcher { auto size_data_type = data_type{type_to_id()}; - // First, scatter `1` to all offsets except the first and last, + // First, reduce offsets column by key, to identify the number of times + // an offset appears. 
+ // Next, scatter the count for each offset (except the first and last), // into a column of N `0`s, where N == number of child rows. // For the example above: // offsets == [0, 2, 5, 8, 11, 13] // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] + // + // If the above example had an empty list row at index 2, + // the same columns would look as follows: + // offsets == [0, 2, 5, 5, 8, 11, 13] + // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] auto num_child_rows = get_num_child_rows(offsets); + auto scatter_values = make_fixed_width_column(size_data_type, - offsets.size()-2, + offsets.size(), mask_state::UNALLOCATED, stream, mr); - thrust::fill_n(thrust::device, - scatter_values->mutable_view().template begin(), - offsets.size()-2, - size_type{1}); // [1,1,1,1,...1] + auto scatter_keys = make_fixed_width_column(size_data_type, + offsets.size(), + mask_state::UNALLOCATED, + stream, + mr); + auto reduced_by_key = thrust::reduce_by_key(thrust::device, + offsets.template begin(), + offsets.template end(), + thrust::make_constant_iterator(1), + scatter_keys->mutable_view().template begin(), + scatter_values->mutable_view().template begin()); + auto scatter_values_end = reduced_by_key.second; auto scatter_output = make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, @@ -984,15 +1000,19 @@ struct rolling_window_launcher { num_child_rows, 0); // [0,0,0,...0] thrust::scatter(thrust::device, - scatter_values->view().template begin(), - scatter_values->view().template end(), - offsets.template begin() + 1, + scatter_values->mutable_view().template begin() + 1, + scatter_values_end, + scatter_keys->view().template begin() + 1, scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] // Next, generate mapping with inclusive_scan() on scatter() result. 
// For the example above: // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] // inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4] + // + // For the case with an empty list at index 2: + // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] + // inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5] auto per_row_mapping = make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu index 827419bb20b..3609de716a7 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -73,6 +73,35 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindowNoNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_fixed_window->view()); } +TEST_F(CollectListTest, RollingWindowWithEmptyLists) +{ + using namespace cudf; + using namespace cudf::test; + + using TypeParam = int32_t; + using T = TypeParam; + + auto input_column = fixed_width_column_wrapper{10,11,12,13,14,15}; + + auto prev_column = fixed_width_column_wrapper{1,2,2,0,2,2}; + auto foll_column = fixed_width_column_wrapper{1,1,1,0,1,0}; + + EXPECT_EQ(static_cast(prev_column).size(), static_cast(foll_column).size()); + + auto result_column_based_window = rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + + auto expected_result = lists_column_wrapper{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {}, + {13, 14, 15}, + {14, 15}, + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); +} + TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowNoNulls) { using namespace cudf; From db732082ef2d7dfb3fd6b32c9cc9a1b34db942a6 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 20 Jan 2021 12:08:39 -0800 Subject: [PATCH 08/24] WIP: Switch for_each_n() to transform() --- cpp/src/rolling/rolling_detail.cuh | 10 +++++----- 1 
file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 527bcba1527..7a1fd6b0d58 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -1037,20 +1037,20 @@ struct rolling_window_launcher { mask_state::UNALLOCATED, stream, mr); - thrust::for_each_n( // Convert to transform(). + thrust::transform( thrust::device, thrust::make_counting_iterator(0), - per_row_mapping.size(), + thrust::make_counting_iterator(per_row_mapping.size()), + gather_map->mutable_view().template begin(), [d_offsets = child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] d_groups = per_row_mapping.template begin(), // E.g. [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] - d_prev = preceding_iter, - d_output = gather_map->mutable_view().template begin()] + d_prev = preceding_iter] __device__(auto i) { auto group = d_groups[i]; auto group_start_offset = d_offsets[group]; auto relative_index = i - group_start_offset; - d_output[i] = (group - d_prev[group] + 1) + relative_index; + return (group - d_prev[group] + 1) + relative_index; } ); return gather_map; From 871a4ae64c5e5201251342eec3a8600eb2bf7732 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 20 Jan 2021 19:48:26 -0800 Subject: [PATCH 09/24] WIP: Support for min_periods checks. Also, fixed null/empty list representation. 
--- cpp/src/rolling/rolling_detail.cuh | 48 ++++++++++++++--- cpp/tests/collect_list/collect_list_test.cu | 57 ++++++++++++++++++++- 2 files changed, 96 insertions(+), 9 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 7a1fd6b0d58..1c0efa0df39 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -931,12 +932,36 @@ struct rolling_window_launcher { preceding_begin + input.size(), following_begin, mutable_sizes.begin(), - [] __device__(auto preceding, auto following) { return preceding + following; } - ); + [min_periods] __device__(auto preceding, auto following) { + return (preceding + following) < min_periods ? 0 : (preceding + following); + }); return strings::detail::make_offsets_child_column(sizes->view().begin(), sizes->view().end(), stream, mr); } + template + std::pair get_collect_list_null_mask(column_view const& input, + PrecedingIter preceding_iter, + FollowingIter following_iter, + size_type min_periods, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + rmm::device_buffer null_mask; + size_type null_count; + std::tie(null_mask, null_count) = valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + [preceding_iter, following_iter, min_periods] + __device__(auto i) { + return (preceding_iter[i] + following_iter[i]) >= min_periods; + }, + stream, + mr); + return (null_count == 0) + ? std::make_pair(rmm::device_buffer{0, stream, mr}, size_type{0}) + : std::make_pair(null_mask, null_count); + } + /** * @brief Generate collect() list child's mapping to input column. * @@ -984,7 +1009,7 @@ struct rolling_window_launcher { stream, mr); auto reduced_by_key = thrust::reduce_by_key(thrust::device, - offsets.template begin(), + offsets.template begin() + 1, // Skip first 0 in offsets. 
offsets.template end(), thrust::make_constant_iterator(1), scatter_keys->mutable_view().template begin(), @@ -1000,9 +1025,9 @@ struct rolling_window_launcher { num_child_rows, 0); // [0,0,0,...0] thrust::scatter(thrust::device, - scatter_values->mutable_view().template begin() + 1, + scatter_values->mutable_view().template begin(), scatter_values_end, - scatter_keys->view().template begin() + 1, + scatter_keys->view().template begin(), scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] // Next, generate mapping with inclusive_scan() on scatter() result. @@ -1122,11 +1147,20 @@ struct rolling_window_launcher { // gather(), to construct child column. auto gather_output = cudf::gather(table_view{std::vector{input}}, gather_map->view()); + rmm::device_buffer null_mask; + size_type null_count; + std::tie(null_mask, null_count) = get_collect_list_null_mask(input, + preceding_begin, + following_begin, + min_periods, + stream, + mr); + return make_lists_column(input.size(), std::move(offsets), std::move(gather_output->release()[0]), - 0, - {}, + null_count, + std::move(null_mask), stream, mr); } diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu index 3609de716a7..81a73c61003 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -73,12 +73,11 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindowNoNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_fixed_window->view()); } -TEST_F(CollectListTest, RollingWindowWithEmptyLists) +TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyLists) { using namespace cudf; using namespace cudf::test; - using TypeParam = int32_t; using T = TypeParam; auto input_column = fixed_width_column_wrapper{10,11,12,13,14,15}; @@ -102,6 +101,60 @@ TEST_F(CollectListTest, RollingWindowWithEmptyLists) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); } 
+TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyListsAtEnds) +{ + using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto input_column = fixed_width_column_wrapper{0,1,2,3,4,5}; + auto num_elements = static_cast(input_column).size(); + + auto prev_column = fixed_width_column_wrapper{0,2,2,2,2,0}; + auto foll_column = fixed_width_column_wrapper{0,1,1,1,1,0}; + + auto result = rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + + auto expected_result = lists_column_wrapper{ + {}, + {0, 1, 2}, + {1, 2, 3}, + {2, 3, 4}, + {3, 4, 5}, + {} + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + +TYPED_TEST(TypedCollectListTest, RollingWindowWithNullLists) +{ + using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto input_column = fixed_width_column_wrapper{0,1,2,3,4,5}; + auto num_elements = static_cast(input_column).size(); + + auto result = rolling_window(input_column, 2, 1, 3, make_collect_aggregation()); + + auto expected_result = lists_column_wrapper{ + { + {}, + {0, 1, 2}, + {1, 2, 3}, + {2, 3, 4}, + {3, 4, 5}, + {} + }, + make_counting_transform_iterator(0, [num_elements](auto i) { return i != 0 && i != (num_elements - 1); }) + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowNoNulls) { using namespace cudf; From 3fbaf3e0186614a95eb2b4b86f2bbd13c890112a Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 20 Jan 2021 20:37:55 -0800 Subject: [PATCH 10/24] WIP: Clarify how empty lists are handled... ... at the beginning of the output. 
--- cpp/src/rolling/rolling_detail.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 1c0efa0df39..073941d4230 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -995,6 +995,12 @@ struct rolling_window_launcher { // the same columns would look as follows: // offsets == [0, 2, 5, 5, 8, 11, 13] // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] + // + // Note: To correctly handle null list rows at the beginning of + // the output column, care must be taken to skip the first `0` + // in the offsets column, when running `reduce_by_key()`. + // This accounts for the `0` added by default to the offsets + // column, marking the beginning of the column. auto num_child_rows = get_num_child_rows(offsets); From 219cb5d53cfffbb134bee885c43e9dd86ca30269 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 20 Jan 2021 22:57:02 -0800 Subject: [PATCH 11/24] WIP: Tests! 
--- cpp/tests/collect_list/collect_list_test.cu | 348 +++++++++++++++++++- 1 file changed, 340 insertions(+), 8 deletions(-) diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu index 81a73c61003..3f8019029d8 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -43,7 +43,7 @@ using TypesForTest = cudf::test::Concat{0,1,2,3,4,5}; auto num_elements = static_cast(input_column).size(); - auto result = rolling_window(input_column, 2, 1, 3, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto result = rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); auto expected_result = lists_column_wrapper{ { @@ -149,25 +152,54 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullLists) {3, 4, 5}, {} }, - make_counting_transform_iterator(0, [num_elements](auto i) { return i != 0 && i != (num_elements - 1); }) + make_counting_transform_iterator(0, + [num_elements](auto i) { return i != 0 && i != (num_elements - 1); }) }.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); + + preceding = 2; + following = 2; + min_periods = 4; + + auto result_2 = rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto expected_result_2 = lists_column_wrapper{ + { + {}, + {0, 1, 2, 3}, + {1, 2, 3, 4}, + {2, 3, 4, 5}, + {}, + {} + }, + make_counting_transform_iterator(0, + [num_elements](auto i) { return i != 0 && i < 4; }) + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2->view()); } -TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowNoNulls) +TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) { using namespace cudf; using namespace cudf::test; using T = TypeParam; - auto group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto input_column = 
fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; + auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; - auto result = grouped_rolling_window(table_view{std::vector{group_column}}, input_column, 2, 1, 1, make_collect_aggregation()); + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); - auto expected_result = lists_column_wrapper{ + auto const expected_result = lists_column_wrapper{ {10, 11}, {10, 11, 12}, {11, 12, 13}, @@ -182,4 +214,304 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowNoNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } +TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) +{ + using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto const time_column = fixed_width_column_wrapper + { 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = grouped_time_range_rolling_window( + table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); + + auto const expected_result = lists_column_wrapper{ + {10, 11, 12, 13}, + {10, 11, 12, 13}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {20}, + {21, 22}, + {21, 22, 23}, + {21, 22, 23} + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + +TEST_F(CollectListTest, 
BasicGroupedTimeRangeRollingWindowOnStrings) +{ + using namespace cudf; + using namespace cudf::test; + + using T = cudf::string_view; + + auto const time_column = fixed_width_column_wrapper + { 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = grouped_time_range_rolling_window( + table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); + + auto const expected_result = lists_column_wrapper{ + {"10", "11", "12", "13"}, + {"10", "11", "12", "13"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {"20"}, + {"21", "22"}, + {"21", "22", "23"}, + {"21", "22", "23"} + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + +TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) +{ + using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto const time_column = fixed_width_column_wrapper + { 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto numeric_member_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; + auto string_member_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; + auto struct_members = std::vector>{}; + struct_members.emplace_back(numeric_member_column.release()); + struct_members.emplace_back(string_member_column.release()); + auto const struct_column = make_structs_column(9, + std::move(struct_members), + 0, + {}); + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = 
grouped_time_range_rolling_window( + table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + make_collect_aggregation()); + + auto expected_numeric_column = fixed_width_column_wrapper{ + 10, 11, 12, 13, + 10, 11, 12, 13, + 10, 11, 12, 13, 14, + 10, 11, 12, 13, 14, + 10, 11, 12, 13, 14, + 20, + 21, 22, + 21, 22, 23, + 21, 22, 23 + }; + + auto expected_string_column = strings_column_wrapper{ + "10", "11", "12", "13", + "10", "11", "12", "13", + "10", "11", "12", "13", "14", + "10", "11", "12", "13", "14", + "10", "11", "12", "13", "14", + "20", + "21", "22", + "21", "22", "23", + "21", "22", "23" + }; + + auto expected_struct_members = std::vector>{}; + expected_struct_members.emplace_back(expected_numeric_column.release()); + expected_struct_members.emplace_back(expected_string_column.release()); + + auto expected_structs_column = make_structs_column(32, std::move(expected_struct_members), 0, {}); + auto expected_offsets_column = fixed_width_column_wrapper{0, 4, 8, 13, 18, 23, 24, 26, 29, 32}.release(); + auto expected_result = make_lists_column(9, std::move(expected_offsets_column), std::move(expected_structs_column), 0, {}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + +TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. 
+ using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto const time_column = fixed_width_column_wrapper + { 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = grouped_time_range_rolling_window( + table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); + + auto const expected_result = lists_column_wrapper{ + { + {10, 11, 12, 13}, + {10, 11, 12, 13}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {}, + {}, + {}, + {} + }, + make_counting_transform_iterator(0, [](auto i){ return i < 5; }) + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + +TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. 
+ using namespace cudf; + using namespace cudf::test; + + using T = cudf::string_view; + + auto const time_column = fixed_width_column_wrapper + { 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = grouped_time_range_rolling_window( + table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); + + auto const expected_result = lists_column_wrapper{ + { + {"10", "11", "12", "13"}, + {"10", "11", "12", "13"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {}, + {}, + {}, + {} + }, + make_counting_transform_iterator(0, [](auto i){ return i < 5; }) + }.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + +TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. 
+ using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto const time_column = fixed_width_column_wrapper + { 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto numeric_member_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; + auto string_member_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; + auto struct_members = std::vector>{}; + struct_members.emplace_back(numeric_member_column.release()); + struct_members.emplace_back(string_member_column.release()); + auto const struct_column = make_structs_column(9, + std::move(struct_members), + 0, + {}); + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = grouped_time_range_rolling_window( + table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + make_collect_aggregation()); + + auto expected_numeric_column = fixed_width_column_wrapper{ + 10, 11, 12, 13, + 10, 11, 12, 13, + 10, 11, 12, 13, 14, + 10, 11, 12, 13, 14, + 10, 11, 12, 13, 14 + }; + + auto expected_string_column = strings_column_wrapper{ + "10", "11", "12", "13", + "10", "11", "12", "13", + "10", "11", "12", "13", "14", + "10", "11", "12", "13", "14", + "10", "11", "12", "13", "14" + }; + + auto expected_struct_members = std::vector>{}; + expected_struct_members.emplace_back(expected_numeric_column.release()); + expected_struct_members.emplace_back(expected_string_column.release()); + + auto expected_structs_column = make_structs_column(23, std::move(expected_struct_members), 0, {}); + auto expected_offsets_column = fixed_width_column_wrapper{0, 4, 8, 13, 18, 23, 23, 23, 23, 23}.release(); + auto expected_validity_iter = make_counting_transform_iterator(0, [](auto i) {return i < 5; }); + auto expected_null_mask = cudf::test::detail::make_null_mask(expected_validity_iter, 
expected_validity_iter + 9); + auto expected_result = make_lists_column(9, + std::move(expected_offsets_column), + std::move(expected_structs_column), + 4, + std::move(expected_null_mask)); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file From 507239c06f75dfca2abaaceb4dca83857253c5dd Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Thu, 21 Jan 2021 16:30:37 -0800 Subject: [PATCH 12/24] WIP: Moved get_num_child_rows() to utilities --- cpp/include/cudf/lists/detail/scatter.cuh | 23 +---------- cpp/include/cudf/lists/detail/utilities.cuh | 46 +++++++++++++++++++++ cpp/src/rolling/rolling_detail.cuh | 29 ++----------- 3 files changed, 51 insertions(+), 47 deletions(-) create mode 100644 cpp/include/cudf/lists/detail/utilities.cuh diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 53f0472fedc..15f975c4eb7 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -160,28 +161,6 @@ rmm::device_uvector list_vector_from_column( return vector; } -/** - * @brief Fetch the number of rows in a lists column's child given its offsets column. - * - * @param list_offsets Offsets child of a lists column - * @param stream The cuda-stream to synchronize on, when reading from device memory - * @return cudf::size_type The last element in the list_offsets column, indicating - * the number of rows in the lists-column's child. - */ -cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, - rmm::cuda_stream_view stream) -{ - // Number of rows in child-column == last offset value. 
- cudf::size_type num_child_rows{}; - CUDA_TRY(cudaMemcpyAsync(&num_child_rows, - list_offsets.data() + list_offsets.size() - 1, - sizeof(cudf::size_type), - cudaMemcpyDeviceToHost, - stream.value())); - stream.synchronize(); - return num_child_rows; -} - /** * @brief Constructs null mask for a scattered list's child column * diff --git a/cpp/include/cudf/lists/detail/utilities.cuh b/cpp/include/cudf/lists/detail/utilities.cuh new file mode 100644 index 00000000000..e52b806506f --- /dev/null +++ b/cpp/include/cudf/lists/detail/utilities.cuh @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief Fetch the number of rows in a lists column's child given its offsets column. + * + * @param[in] list_offsets Offsets child of a lists column + * @param[in] stream The cuda-stream to synchronize on, when reading from device memory + * @return cudf::size_type The number of child rows in the lists column + */ +static cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, + rmm::cuda_stream_view stream) +{ + // Number of rows in child-column == last offset value. 
+ cudf::size_type num_child_rows{}; + CUDA_TRY(cudaMemcpyAsync(&num_child_rows, + list_offsets.data() + list_offsets.size() - 1, + sizeof(cudf::size_type), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + return num_child_rows; +} + +} // namespace detail; +} // namespace cudf; diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 073941d4230..818d6209e81 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -898,21 +899,6 @@ struct rolling_window_launcher { mr); } - // TODO: Move to lists/utilities. - cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) - { - // Number of rows in child-column == last offset value. - cudf::size_type num_child_rows{}; - CUDA_TRY(cudaMemcpyAsync(&num_child_rows, - list_offsets.data() + list_offsets.size() - 1, - sizeof(cudf::size_type), - cudaMemcpyDeviceToHost, - stream.value())); - stream.synchronize(); - return num_child_rows; - } - template std::unique_ptr get_collect_list_offsets(column_view const& input, PrecedingIter preceding_begin, @@ -975,7 +961,7 @@ struct rolling_window_launcher { * and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. * Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] */ - std::unique_ptr get_list_child_to_input_mapping(cudf::column_view const& offsets, + std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view const& offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -1002,7 +988,7 @@ struct rolling_window_launcher { // This accounts for the `0` added by default to the offsets // column, marking the beginning of the column. 
- auto num_child_rows = get_num_child_rows(offsets); + auto num_child_rows = get_num_child_rows(offsets, stream); auto scatter_values = make_fixed_width_column(size_data_type, offsets.size(), @@ -1103,13 +1089,6 @@ struct rolling_window_launcher { CUDF_EXPECTS(default_outputs.is_empty(), "COLLECT window function does not support default values."); - // COLLECT() should be supported on all data types. - // Output column must be of type `list`. - - // FIXME: min_periods not yet supported: - // 1. Short term: Construct null-mask based on (list.size() >= min_periods) - // 2. Long term: Reduce list sizes to zero, for null rows. - using namespace cudf; using namespace cudf::detail; @@ -1141,7 +1120,7 @@ struct rolling_window_launcher { // Map each element of the collect() result's child column // to the index where it appears in the input. - auto per_row_mapping = get_list_child_to_input_mapping(offsets->view(), stream, mr); + auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream, mr); // Generate gather map to produce the collect() result's child column. auto gather_map = get_gather_map_for_child_column(offsets->view(), From aec5ae127dcd6d18bf8bc5d302f214d75fad53b1 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Thu, 21 Jan 2021 16:48:04 -0800 Subject: [PATCH 13/24] Code formatting. 
--- cpp/include/cudf/lists/detail/utilities.cuh | 6 +- cpp/src/rolling/rolling_detail.cuh | 229 +++++----- cpp/src/rolling/rolling_detail.hpp | 11 +- cpp/tests/collect_list/collect_list_test.cu | 483 +++++++++----------- 4 files changed, 338 insertions(+), 391 deletions(-) diff --git a/cpp/include/cudf/lists/detail/utilities.cuh b/cpp/include/cudf/lists/detail/utilities.cuh index e52b806506f..ccee9b0d5d9 100644 --- a/cpp/include/cudf/lists/detail/utilities.cuh +++ b/cpp/include/cudf/lists/detail/utilities.cuh @@ -29,7 +29,7 @@ namespace detail { * @return cudf::size_type The number of child rows in the lists column */ static cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream) { // Number of rows in child-column == last offset value. cudf::size_type num_child_rows{}; @@ -42,5 +42,5 @@ static cudf::size_type get_num_child_rows(cudf::column_view const& list_offsets, return num_child_rows; } -} // namespace detail; -} // namespace cudf; +} // namespace detail +} // namespace cudf diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 818d6209e81..51a6392672d 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -817,7 +817,8 @@ struct rolling_window_launcher { template - std::enable_if_t> operator()(column_view const& input, column_view const& default_outputs, @@ -911,66 +912,67 @@ struct rolling_window_launcher { // Materialize offsets column. 
auto size_data_type = data_type{type_to_id()}; - auto sizes = make_fixed_width_column(size_data_type, input.size(), mask_state::UNALLOCATED, stream, mr); + auto sizes = + make_fixed_width_column(size_data_type, input.size(), mask_state::UNALLOCATED, stream, mr); auto mutable_sizes = sizes->mutable_view(); - thrust::transform(thrust::device, + thrust::transform(thrust::device, preceding_begin, preceding_begin + input.size(), following_begin, mutable_sizes.begin(), - [min_periods] __device__(auto preceding, auto following) { + [min_periods] __device__(auto preceding, auto following) { return (preceding + following) < min_periods ? 0 : (preceding + following); }); - return strings::detail::make_offsets_child_column(sizes->view().begin(), - sizes->view().end(), stream, mr); + return strings::detail::make_offsets_child_column( + sizes->view().begin(), sizes->view().end(), stream, mr); } template - std::pair get_collect_list_null_mask(column_view const& input, - PrecedingIter preceding_iter, - FollowingIter following_iter, - size_type min_periods, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::pair get_collect_list_null_mask( + column_view const& input, + PrecedingIter preceding_iter, + FollowingIter following_iter, + size_type min_periods, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_buffer null_mask; size_type null_count; - std::tie(null_mask, null_count) = valid_if(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - [preceding_iter, following_iter, min_periods] - __device__(auto i) { - return (preceding_iter[i] + following_iter[i]) >= min_periods; - }, - stream, - mr); - return (null_count == 0) - ? 
std::make_pair(rmm::device_buffer{0, stream, mr}, size_type{0}) - : std::make_pair(null_mask, null_count); + std::tie(null_mask, null_count) = valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + [preceding_iter, following_iter, min_periods] __device__(auto i) { + return (preceding_iter[i] + following_iter[i]) >= min_periods; + }, + stream, + mr); + return (null_count == 0) ? std::make_pair(rmm::device_buffer{0, stream, mr}, size_type{0}) + : std::make_pair(null_mask, null_count); } /** * @brief Generate collect() list child's mapping to input column. - * - * If + * + * If * input col == [A,B,C,D,E] * and preceding == [1,2,2,2,2], * and following == [1,1,1,1,0], - * then, + * then, * collect result == [ [A,B], [A,B,C], [B,C,D], [C,D,E], [D,E] ] * i.e. result offset column == [0,2,5,8,11,13], * and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. * Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] */ - std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view const& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view const& offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace cudf; auto size_data_type = data_type{type_to_id()}; // First, reduce offsets column by key, to identify the number of times - // an offset appears. + // an offset appears. // Next, scatter the count for each offset (except the first and last), // into a column of N `0`s, where N == number of child rows. 
// For the example above: @@ -981,8 +983,8 @@ struct rolling_window_launcher { // the same columns would look as follows: // offsets == [0, 2, 5, 5, 8, 11, 13] // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] - // - // Note: To correctly handle null list rows at the beginning of + // + // Note: To correctly handle null list rows at the beginning of // the output column, care must be taken to skip the first `0` // in the offsets column, when running `reduce_by_key()`. // This accounts for the `0` added by default to the offsets @@ -990,37 +992,30 @@ struct rolling_window_launcher { auto num_child_rows = get_num_child_rows(offsets, stream); - auto scatter_values = make_fixed_width_column(size_data_type, - offsets.size(), - mask_state::UNALLOCATED, - stream, - mr); - auto scatter_keys = make_fixed_width_column(size_data_type, - offsets.size(), - mask_state::UNALLOCATED, - stream, - mr); - auto reduced_by_key = thrust::reduce_by_key(thrust::device, - offsets.template begin() + 1, // Skip first 0 in offsets. - offsets.template end(), - thrust::make_constant_iterator(1), - scatter_keys->mutable_view().template begin(), - scatter_values->mutable_view().template begin()); + auto scatter_values = + make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); + auto scatter_keys = + make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); + auto reduced_by_key = + thrust::reduce_by_key(thrust::device, + offsets.template begin() + 1, // Skip first 0 in offsets. 
+ offsets.template end(), + thrust::make_constant_iterator(1), + scatter_keys->mutable_view().template begin(), + scatter_values->mutable_view().template begin()); auto scatter_values_end = reduced_by_key.second; - auto scatter_output = make_fixed_width_column(size_data_type, - num_child_rows, - mask_state::UNALLOCATED, - stream, - mr); - thrust::fill_n(thrust::device, - scatter_output->mutable_view().template begin(), - num_child_rows, - 0); // [0,0,0,...0] - thrust::scatter(thrust::device, - scatter_values->mutable_view().template begin(), - scatter_values_end, - scatter_keys->view().template begin(), - scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] + auto scatter_output = + make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); + thrust::fill_n(thrust::device, + scatter_output->mutable_view().template begin(), + num_child_rows, + 0); // [0,0,0,...0] + thrust::scatter( + thrust::device, + scatter_values->mutable_view().template begin(), + scatter_values_end, + scatter_keys->view().template begin(), + scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] // Next, generate mapping with inclusive_scan() on scatter() result. 
// For the example above: @@ -1030,11 +1025,8 @@ struct rolling_window_launcher { // For the case with an empty list at index 3: // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] // inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5] - auto per_row_mapping = make_fixed_width_column(size_data_type, - num_child_rows, - mask_state::UNALLOCATED, - stream, - mr); + auto per_row_mapping = + make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); thrust::inclusive_scan(thrust::device, scatter_output->view().template begin(), scatter_output->view().template end(), @@ -1049,42 +1041,40 @@ struct rolling_window_launcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto gather_map = make_fixed_width_column(data_type{type_to_id()}, + auto gather_map = make_fixed_width_column(data_type{type_to_id()}, per_row_mapping.size(), mask_state::UNALLOCATED, stream, mr); thrust::transform( - thrust::device, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(per_row_mapping.size()), - gather_map->mutable_view().template begin(), - [d_offsets = child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] - d_groups = per_row_mapping.template begin(), // E.g. [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] - d_prev = preceding_iter] - __device__(auto i) { - auto group = d_groups[i]; - auto group_start_offset = d_offsets[group]; - auto relative_index = i - group_start_offset; - - return (group - d_prev[group] + 1) + relative_index; - } - ); + thrust::device, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(per_row_mapping.size()), + gather_map->mutable_view().template begin(), + [d_offsets = + child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] + d_groups = + per_row_mapping.template begin(), // E.g. 
[0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] + d_prev = preceding_iter] __device__(auto i) { + auto group = d_groups[i]; + auto group_start_offset = d_offsets[group]; + auto relative_index = i - group_start_offset; + + return (group - d_prev[group] + 1) + relative_index; + }); return gather_map; } - template - std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr> - operator()(column_view const& input, - column_view const& default_outputs, - PrecedingIter preceding_begin_raw, - FollowingIter following_begin_raw, - size_type min_periods, - std::unique_ptr const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + template + std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr> operator()( + column_view const& input, + column_view const& default_outputs, + PrecedingIter preceding_begin_raw, + FollowingIter following_begin_raw, + size_type min_periods, + std::unique_ptr const& agg, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(default_outputs.is_empty(), "COLLECT window function does not support default values."); @@ -1097,49 +1087,38 @@ struct rolling_window_launcher { // Fix up preceding/following iterators to respect column boundaries, // similar to gpu_rolling(). // `rolling_window()` does not fix up preceding/following so as not to read past - // column boundaries. + // column boundaries. // `grouped_rolling_window()` and `time_range_based_grouped_rolling_window() do. 
- auto preceding_begin - = thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [preceding_begin_raw] __device__(auto i) { - return thrust::min(preceding_begin_raw[i], i+1); - }); - auto following_begin - = thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [following_begin_raw, size = input.size()] __device__(auto i) { - return thrust::min(following_begin_raw[i], size - i - 1); - }); + auto preceding_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [preceding_begin_raw] __device__(auto i) { + return thrust::min(preceding_begin_raw[i], i + 1); + }); + auto following_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [following_begin_raw, size = input.size()] __device__(auto i) { + return thrust::min(following_begin_raw[i], size - i - 1); + }); // Materialize collect list's offsets. - auto offsets = get_collect_list_offsets(input, - preceding_begin, - following_begin, - min_periods, - stream, - mr); + auto offsets = + get_collect_list_offsets(input, preceding_begin, following_begin, min_periods, stream, mr); // Map each element of the collect() result's child column // to the index where it appears in the input. auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream, mr); // Generate gather map to produce the collect() result's child column. - auto gather_map = get_gather_map_for_child_column(offsets->view(), - per_row_mapping->view(), - preceding_begin, - stream, - mr); + auto gather_map = get_gather_map_for_child_column( + offsets->view(), per_row_mapping->view(), preceding_begin, stream, mr); // gather(), to construct child column. 
- auto gather_output = cudf::gather(table_view{std::vector{input}}, gather_map->view()); - + auto gather_output = + cudf::gather(table_view{std::vector{input}}, gather_map->view()); + rmm::device_buffer null_mask; size_type null_count; - std::tie(null_mask, null_count) = get_collect_list_null_mask(input, - preceding_begin, - following_begin, - min_periods, - stream, - mr); + std::tie(null_mask, null_count) = + get_collect_list_null_mask(input, preceding_begin, following_begin, min_periods, stream, mr); return make_lists_column(input.size(), std::move(offsets), diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index 8b8c9c168d3..235c0558710 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -53,13 +53,13 @@ static constexpr bool is_rolling_supported() } else if (cudf::is_timestamp()) { return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or (op == aggregation::LAG) or - (op == aggregation::COLLECT); + (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or + (op == aggregation::LAG) or (op == aggregation::COLLECT); } else if (cudf::is_fixed_point()) { return (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or (op == aggregation::LAG) or - (op == aggregation::COLLECT); + (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or + (op == aggregation::LAG) or (op == aggregation::COLLECT); } else if (std::is_same()) { return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or @@ -71,8 +71,7 @@ static constexpr bool is_rolling_supported() } else if (std::is_same()) { // TODO: Add support for COUNT_VALID, 
COUNT_ALL, ROW_NUMBER. return op == aggregation::COLLECT; - } - else { + } else { return false; } } diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cu index 3f8019029d8..9ef854a0e11 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cu @@ -32,14 +32,15 @@ #include #include -struct CollectListTest : public cudf::test::BaseFixture {}; +struct CollectListTest : public cudf::test::BaseFixture { +}; template -struct TypedCollectListTest : public CollectListTest {}; +struct TypedCollectListTest : public CollectListTest { +}; -using TypesForTest = cudf::test::Concat; +using TypesForTest = cudf::test:: + Concat; TYPED_TEST_CASE(TypedCollectListTest, TypesForTest); @@ -50,22 +51,26 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow) using T = TypeParam; - auto input_column = fixed_width_column_wrapper{10,11,12,13,14}; + auto input_column = fixed_width_column_wrapper{10, 11, 12, 13, 14}; - auto prev_column = fixed_width_column_wrapper{1,2,2,2,2}; - auto foll_column = fixed_width_column_wrapper{1,1,1,1,0}; + auto prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; - EXPECT_EQ(static_cast(prev_column).size(), static_cast(foll_column).size()); + EXPECT_EQ(static_cast(prev_column).size(), + static_cast(foll_column).size()); - auto result_column_based_window = rolling_window(input_column, prev_column, foll_column, 1, make_collect_aggregation()); + auto result_column_based_window = + rolling_window(input_column, prev_column, foll_column, 1, make_collect_aggregation()); - auto expected_result = lists_column_wrapper{ - {10, 11}, - {10, 11, 12}, - {11, 12, 13}, - {12, 13, 14}, - {13, 14}, - }.release(); + auto expected_result = + lists_column_wrapper{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {12, 13, 14}, + {13, 14}, + } + .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), 
result_column_based_window->view()); @@ -80,23 +85,27 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyLists) using T = TypeParam; - auto input_column = fixed_width_column_wrapper{10,11,12,13,14,15}; + auto input_column = fixed_width_column_wrapper{10, 11, 12, 13, 14, 15}; - auto prev_column = fixed_width_column_wrapper{1,2,2,0,2,2}; - auto foll_column = fixed_width_column_wrapper{1,1,1,0,1,0}; + auto prev_column = fixed_width_column_wrapper{1, 2, 2, 0, 2, 2}; + auto foll_column = fixed_width_column_wrapper{1, 1, 1, 0, 1, 0}; - EXPECT_EQ(static_cast(prev_column).size(), static_cast(foll_column).size()); + EXPECT_EQ(static_cast(prev_column).size(), + static_cast(foll_column).size()); - auto result_column_based_window = rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + auto result_column_based_window = + rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); - auto expected_result = lists_column_wrapper{ - {10, 11}, - {10, 11, 12}, - {11, 12, 13}, - {}, - {13, 14, 15}, - {14, 15}, - }.release(); + auto expected_result = + lists_column_wrapper{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {}, + {13, 14, 15}, + {14, 15}, + } + .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); } @@ -108,22 +117,17 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyListsAtEnds) using T = TypeParam; - auto input_column = fixed_width_column_wrapper{0,1,2,3,4,5}; + auto input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; auto num_elements = static_cast(input_column).size(); - auto prev_column = fixed_width_column_wrapper{0,2,2,2,2,0}; - auto foll_column = fixed_width_column_wrapper{0,1,1,1,1,0}; + auto prev_column = fixed_width_column_wrapper{0, 2, 2, 2, 2, 0}; + auto foll_column = fixed_width_column_wrapper{0, 1, 1, 1, 1, 0}; - auto result = rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + auto 
result = + rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); - auto expected_result = lists_column_wrapper{ - {}, - {0, 1, 2}, - {1, 2, 3}, - {2, 3, 4}, - {3, 4, 5}, - {} - }.release(); + auto expected_result = + lists_column_wrapper{{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } @@ -135,46 +139,34 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullLists) using T = TypeParam; - auto input_column = fixed_width_column_wrapper{0,1,2,3,4,5}; + auto input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; auto num_elements = static_cast(input_column).size(); - auto preceding = 2; - auto following = 1; + auto preceding = 2; + auto following = 1; auto min_periods = 3; - auto result = rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto result = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); auto expected_result = lists_column_wrapper{ - { - {}, - {0, 1, 2}, - {1, 2, 3}, - {2, 3, 4}, - {3, 4, 5}, - {} - }, - make_counting_transform_iterator(0, - [num_elements](auto i) { return i != 0 && i != (num_elements - 1); }) - }.release(); + {{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}, + make_counting_transform_iterator(0, [num_elements](auto i) { + return i != 0 && i != (num_elements - 1); + })}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); - preceding = 2; - following = 2; + preceding = 2; + following = 2; min_periods = 4; - auto result_2 = rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto result_2 = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); auto expected_result_2 = lists_column_wrapper{ - { - {}, - {0, 1, 2, 3}, - {1, 2, 3, 4}, - {2, 3, 4, 5}, - {}, - {} - }, - 
make_counting_transform_iterator(0, - [num_elements](auto i) { return i != 0 && i < 4; }) - }.release(); + {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, + make_counting_transform_iterator(0, [num_elements](auto i) { + return i != 0 && i < 4; + })}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2->view()); } @@ -186,18 +178,19 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) using T = TypeParam; - auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = + fixed_width_column_wrapper{10, 11, 12, 13, 14, 20, 21, 22, 23}; - auto const preceding = 2; - auto const following = 1; + auto const preceding = 2; + auto const following = 1; auto const min_periods = 1; - auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, - input_column, - preceding, - following, - min_periods, - make_collect_aggregation()); + auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); auto const expected_result = lists_column_wrapper{ {10, 11}, @@ -208,8 +201,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) {20, 21}, {20, 21, 22}, {21, 22, 23}, - {22, 23} - }.release(); + {22, 23}}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } @@ -221,22 +213,23 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) using T = TypeParam; - auto const time_column = fixed_width_column_wrapper - { 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; - auto const preceding = 2; - auto 
const following = 1; + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = + fixed_width_column_wrapper{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto const preceding = 2; + auto const following = 1; auto const min_periods = 1; - auto const result = grouped_time_range_rolling_window( - table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input_column, - preceding, - following, - min_periods, - make_collect_aggregation()); + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); auto const expected_result = lists_column_wrapper{ {10, 11, 12, 13}, @@ -247,8 +240,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) {20}, {21, 22}, {21, 22, 23}, - {21, 22, 23} - }.release(); + {21, 22, 23}}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } @@ -260,22 +252,23 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings) using T = cudf::string_view; - auto const time_column = fixed_width_column_wrapper - { 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; - auto const preceding = 2; - auto const following = 1; + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = + strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; + auto const preceding = 2; + auto const following = 1; auto const min_periods = 1; - auto const result = 
grouped_time_range_rolling_window( - table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input_column, - preceding, - following, - min_periods, - make_collect_aggregation()); + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); auto const expected_result = lists_column_wrapper{ {"10", "11", "12", "13"}, @@ -286,8 +279,7 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings) {"20"}, {"21", "22"}, {"21", "22", "23"}, - {"21", "22", "23"} - }.release(); + {"21", "22", "23"}}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } @@ -299,62 +291,47 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) using T = TypeParam; - auto const time_column = fixed_width_column_wrapper - { 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto numeric_member_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; - auto string_member_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto numeric_member_column = + fixed_width_column_wrapper{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto string_member_column = + strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; auto struct_members = std::vector>{}; struct_members.emplace_back(numeric_member_column.release()); struct_members.emplace_back(string_member_column.release()); - auto const struct_column = make_structs_column(9, - std::move(struct_members), - 0, - {}); - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - auto const 
result = grouped_time_range_rolling_window( - table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - struct_column->view(), - preceding, - following, - min_periods, - make_collect_aggregation()); + auto const struct_column = make_structs_column(9, std::move(struct_members), 0, {}); + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + make_collect_aggregation()); auto expected_numeric_column = fixed_width_column_wrapper{ - 10, 11, 12, 13, - 10, 11, 12, 13, - 10, 11, 12, 13, 14, - 10, 11, 12, 13, 14, - 10, 11, 12, 13, 14, - 20, - 21, 22, - 21, 22, 23, - 21, 22, 23 - }; + 10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, + 13, 14, 10, 11, 12, 13, 14, 20, 21, 22, 21, 22, 23, 21, 22, 23}; auto expected_string_column = strings_column_wrapper{ - "10", "11", "12", "13", - "10", "11", "12", "13", - "10", "11", "12", "13", "14", - "10", "11", "12", "13", "14", - "10", "11", "12", "13", "14", - "20", - "21", "22", - "21", "22", "23", - "21", "22", "23" - }; + "10", "11", "12", "13", "10", "11", "12", "13", "10", "11", "12", "13", "14", "10", "11", "12", + "13", "14", "10", "11", "12", "13", "14", "20", "21", "22", "21", "22", "23", "21", "22", "23"}; auto expected_struct_members = std::vector>{}; expected_struct_members.emplace_back(expected_numeric_column.release()); expected_struct_members.emplace_back(expected_string_column.release()); auto expected_structs_column = make_structs_column(32, std::move(expected_struct_members), 0, {}); - auto expected_offsets_column = fixed_width_column_wrapper{0, 4, 8, 13, 18, 23, 24, 26, 29, 32}.release(); - auto expected_result = make_lists_column(9, std::move(expected_offsets_column), std::move(expected_structs_column), 0, {}); + auto expected_offsets_column = + 
fixed_width_column_wrapper{0, 4, 8, 13, 18, 23, 24, 26, 29, 32}.release(); + auto expected_result = make_lists_column( + 9, std::move(expected_offsets_column), std::move(expected_structs_column), 0, {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } @@ -368,37 +345,37 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) using T = TypeParam; - auto const time_column = fixed_width_column_wrapper - { 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; - auto const preceding = 2; - auto const following = 1; + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = + fixed_width_column_wrapper{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto const preceding = 2; + auto const following = 1; auto const min_periods = 4; - auto const result = grouped_time_range_rolling_window( - table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input_column, - preceding, - following, - min_periods, - make_collect_aggregation()); + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); auto const expected_result = lists_column_wrapper{ - { - {10, 11, 12, 13}, - {10, 11, 12, 13}, - {10, 11, 12, 13, 14}, - {10, 11, 12, 13, 14}, - {10, 11, 12, 13, 14}, - {}, - {}, - {}, - {} - }, - make_counting_transform_iterator(0, [](auto i){ return i < 5; }) - }.release(); + {{10, 11, 12, 13}, + {10, 11, 12, 13}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {}, + {}, + {}, + {}}, + make_counting_transform_iterator(0, [](auto i) { + return i < 5; + 
})}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } @@ -412,37 +389,37 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) using T = cudf::string_view; - auto const time_column = fixed_width_column_wrapper - { 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; - auto const preceding = 2; - auto const following = 1; + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = + strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; + auto const preceding = 2; + auto const following = 1; auto const min_periods = 4; - auto const result = grouped_time_range_rolling_window( - table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input_column, - preceding, - following, - min_periods, - make_collect_aggregation()); + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); auto const expected_result = lists_column_wrapper{ - { - {"10", "11", "12", "13"}, - {"10", "11", "12", "13"}, - {"10", "11", "12", "13", "14"}, - {"10", "11", "12", "13", "14"}, - {"10", "11", "12", "13", "14"}, - {}, - {}, - {}, - {} - }, - make_counting_transform_iterator(0, [](auto i){ return i < 5; }) - }.release(); + {{"10", "11", "12", "13"}, + {"10", "11", "12", "13"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {}, + {}, + {}, + {}}, + make_counting_transform_iterator(0, [](auto i) { + return i < 5; + })}.release(); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } @@ -456,59 +433,51 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe using T = TypeParam; - auto const time_column = fixed_width_column_wrapper - { 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{ 1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto numeric_member_column = fixed_width_column_wrapper{10,11,12,13,14, 20,21,22,23}; - auto string_member_column = strings_column_wrapper{"10","11","12","13","14", "20","21","22","23"}; + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto numeric_member_column = + fixed_width_column_wrapper{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto string_member_column = + strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; auto struct_members = std::vector>{}; struct_members.emplace_back(numeric_member_column.release()); struct_members.emplace_back(string_member_column.release()); - auto const struct_column = make_structs_column(9, - std::move(struct_members), - 0, - {}); - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 4; - auto const result = grouped_time_range_rolling_window( - table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - struct_column->view(), - preceding, - following, - min_periods, - make_collect_aggregation()); + auto const struct_column = make_structs_column(9, std::move(struct_members), 0, {}); + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + make_collect_aggregation()); auto expected_numeric_column = fixed_width_column_wrapper{ - 10, 11, 12, 13, 
- 10, 11, 12, 13, - 10, 11, 12, 13, 14, - 10, 11, 12, 13, 14, - 10, 11, 12, 13, 14 - }; + 10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14}; - auto expected_string_column = strings_column_wrapper{ - "10", "11", "12", "13", - "10", "11", "12", "13", - "10", "11", "12", "13", "14", - "10", "11", "12", "13", "14", - "10", "11", "12", "13", "14" - }; + auto expected_string_column = + strings_column_wrapper{"10", "11", "12", "13", "10", "11", "12", "13", "10", "11", "12", "13", + "14", "10", "11", "12", "13", "14", "10", "11", "12", "13", "14"}; auto expected_struct_members = std::vector>{}; expected_struct_members.emplace_back(expected_numeric_column.release()); expected_struct_members.emplace_back(expected_string_column.release()); auto expected_structs_column = make_structs_column(23, std::move(expected_struct_members), 0, {}); - auto expected_offsets_column = fixed_width_column_wrapper{0, 4, 8, 13, 18, 23, 23, 23, 23, 23}.release(); - auto expected_validity_iter = make_counting_transform_iterator(0, [](auto i) {return i < 5; }); - auto expected_null_mask = cudf::test::detail::make_null_mask(expected_validity_iter, expected_validity_iter + 9); - auto expected_result = make_lists_column(9, - std::move(expected_offsets_column), - std::move(expected_structs_column), - 4, + auto expected_offsets_column = + fixed_width_column_wrapper{0, 4, 8, 13, 18, 23, 23, 23, 23, 23}.release(); + auto expected_validity_iter = make_counting_transform_iterator(0, [](auto i) { return i < 5; }); + auto expected_null_mask = + cudf::test::detail::make_null_mask(expected_validity_iter, expected_validity_iter + 9); + auto expected_result = make_lists_column(9, + std::move(expected_offsets_column), + std::move(expected_structs_column), + 4, std::move(expected_null_mask)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); From 3aa029ba9fa382e2618150e6ee9613a538253a7d Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Thu, 21 Jan 
2021 21:38:00 -0800 Subject: [PATCH 14/24] Moved collect_list_test to .cpp --- cpp/tests/CMakeLists.txt | 2 +- .../{collect_list_test.cu => collect_list_test.cpp} | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) rename cpp/tests/collect_list/{collect_list_test.cu => collect_list_test.cpp} (99%) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index fe273852db6..bc47b937701 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -497,7 +497,7 @@ ConfigureTest(LEAD_LAG_TEST "${LEAD_LAG_TEST_SRC}") # - collect_list rolling tests --------------------------------------------------------------------------------- set(COLLECT_LIST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/collect_list/collect_list_test.cu") + "${CMAKE_CURRENT_SOURCE_DIR}/collect_list/collect_list_test.cpp") ConfigureTest(COLLECT_LIST_TEST "${COLLECT_LIST_SRC}") diff --git a/cpp/tests/collect_list/collect_list_test.cu b/cpp/tests/collect_list/collect_list_test.cpp similarity index 99% rename from cpp/tests/collect_list/collect_list_test.cu rename to cpp/tests/collect_list/collect_list_test.cpp index 9ef854a0e11..1462487bc75 100644 --- a/cpp/tests/collect_list/collect_list_test.cu +++ b/cpp/tests/collect_list/collect_list_test.cpp @@ -118,7 +118,6 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyListsAtEnds) using T = TypeParam; auto input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; - auto num_elements = static_cast(input_column).size(); auto prev_column = fixed_width_column_wrapper{0, 2, 2, 2, 2, 0}; auto foll_column = fixed_width_column_wrapper{0, 1, 1, 1, 1, 0}; @@ -250,8 +249,6 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings) using namespace cudf; using namespace cudf::test; - using T = cudf::string_view; - auto const time_column = fixed_width_column_wrapper{ 1, 1, 2, 2, 3, 1, 4, 5, 6}; auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; @@ -387,8 +384,6 @@ TEST_F(CollectListTest, 
GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) using namespace cudf; using namespace cudf::test; - using T = cudf::string_view; - auto const time_column = fixed_width_column_wrapper{ 1, 1, 2, 2, 3, 1, 4, 5, 6}; auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; From cf281f0367defdeb10e1989de8cab4da0d8e691e Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Fri, 22 Jan 2021 16:09:08 -0800 Subject: [PATCH 15/24] Const all the things. Added some test descriptions. --- cpp/tests/collect_list/collect_list_test.cpp | 50 +++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/cpp/tests/collect_list/collect_list_test.cpp b/cpp/tests/collect_list/collect_list_test.cpp index 1462487bc75..b17b7835f9e 100644 --- a/cpp/tests/collect_list/collect_list_test.cpp +++ b/cpp/tests/collect_list/collect_list_test.cpp @@ -51,18 +51,18 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow) using T = TypeParam; - auto input_column = fixed_width_column_wrapper{10, 11, 12, 13, 14}; + auto const input_column = fixed_width_column_wrapper{10, 11, 12, 13, 14}; - auto prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; - auto foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; EXPECT_EQ(static_cast(prev_column).size(), static_cast(foll_column).size()); - auto result_column_based_window = + auto const result_column_based_window = rolling_window(input_column, prev_column, foll_column, 1, make_collect_aggregation()); - auto expected_result = + auto const expected_result = lists_column_wrapper{ {10, 11}, {10, 11, 12}, @@ -74,29 +74,30 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); - auto result_fixed_window = rolling_window(input_column, 2, 1, 1, make_collect_aggregation()); + auto 
const result_fixed_window = + rolling_window(input_column, 2, 1, 1, make_collect_aggregation()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_fixed_window->view()); } -TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyLists) +TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputLists) { using namespace cudf; using namespace cudf::test; using T = TypeParam; - auto input_column = fixed_width_column_wrapper{10, 11, 12, 13, 14, 15}; + auto const input_column = fixed_width_column_wrapper{10, 11, 12, 13, 14, 15}; - auto prev_column = fixed_width_column_wrapper{1, 2, 2, 0, 2, 2}; - auto foll_column = fixed_width_column_wrapper{1, 1, 1, 0, 1, 0}; + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 0, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 0, 1, 0}; EXPECT_EQ(static_cast(prev_column).size(), static_cast(foll_column).size()); - auto result_column_based_window = + auto const result_column_based_window = rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); - auto expected_result = + auto const expected_result = lists_column_wrapper{ {10, 11}, {10, 11, 12}, @@ -110,44 +111,47 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyLists) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); } -TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyListsAtEnds) +TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputListsAtEnds) { using namespace cudf; using namespace cudf::test; using T = TypeParam; - auto input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; + auto const input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; - auto prev_column = fixed_width_column_wrapper{0, 2, 2, 2, 2, 0}; - auto foll_column = fixed_width_column_wrapper{0, 1, 1, 1, 1, 0}; + auto const prev_column = fixed_width_column_wrapper{0, 2, 2, 2, 2, 0}; + auto foll_column = fixed_width_column_wrapper{0, 1, 1, 1, 1, 0}; - auto 
result = + auto const result = rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); - auto expected_result = + auto const expected_result = lists_column_wrapper{{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}.release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } -TYPED_TEST(TypedCollectListTest, RollingWindowWithNullLists) +TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) { + // Test that when the number of observations is fewer than min_periods, + // the result is null. + using namespace cudf; using namespace cudf::test; using T = TypeParam; - auto input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; - auto num_elements = static_cast(input_column).size(); + auto const input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; + auto const num_elements = static_cast(input_column).size(); auto preceding = 2; auto following = 1; auto min_periods = 3; - auto result = + auto const result = rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); - auto expected_result = lists_column_wrapper{ + auto const expected_result = lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}, make_counting_transform_iterator(0, [num_elements](auto i) { return i != 0 && i != (num_elements - 1); From 06764ad60e6c8bed8b40b8e721d1577488cd5aa7 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 25 Jan 2021 16:44:01 -0800 Subject: [PATCH 16/24] Added tests for collecting decimal columns --- cpp/tests/collect_list/collect_list_test.cpp | 109 +++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/cpp/tests/collect_list/collect_list_test.cpp b/cpp/tests/collect_list/collect_list_test.cpp index b17b7835f9e..2d3056d31cd 100644 --- a/cpp/tests/collect_list/collect_list_test.cpp +++ b/cpp/tests/collect_list/collect_list_test.cpp @@ -174,6 +174,115 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2->view()); } +TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) +{ + // Test that when the number of observations is fewer than min_periods, + // the result is null. + + using namespace cudf; + using namespace cudf::test; + + auto const input_column = strings_column_wrapper{"0", "1", "2", "3", "4", "5"}; + auto const num_elements = static_cast(input_column).size(); + + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + + auto const expected_result = lists_column_wrapper{ + {{}, {"0", "1", "2"}, {"1", "2", "3"}, {"2", "3", "4"}, {"3", "4", "5"}, {}}, + make_counting_transform_iterator(0, [num_elements](auto i) { + return i != 0 && i != (num_elements - 1); + })}.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); + + preceding = 2; + following = 2; + min_periods = 4; + + auto result_2 = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto expected_result_2 = lists_column_wrapper{ + {{}, {"0", "1", "2", "3"}, {"1", "2", "3", "4"}, {"2", "3", "4", "5"}, {}, {}}, + make_counting_transform_iterator(0, [num_elements](auto i) { + return i != 0 && i < 4; + })}.release(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2->view()); +} + +TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) +{ + // Test that when the number of observations is fewer than min_periods, + // the result is null. + + using namespace cudf; + using namespace cudf::test; + + auto const input_iter = make_counting_transform_iterator(0, thrust::identity{}); + auto const input_column = + fixed_point_column_wrapper{input_iter, input_iter + 6, numeric::scale_type{0}}; + + { + // One result row at each end should be null. 
+ auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + + auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; + auto expected_result_child = + fixed_point_column_wrapper{expected_result_child_values.begin(), + expected_result_child_values.end(), + numeric::scale_type{0}}; + auto expected_offsets = fixed_width_column_wrapper{0, 0, 3, 6, 9, 12, 12}.release(); + auto expected_num_rows = expected_offsets->size() - 1; + auto null_mask_iter = make_counting_transform_iterator( + size_type{0}, [expected_num_rows](auto i) { return i != 0 && i != (expected_num_rows - 1); }); + + auto expected_result = make_lists_column( + expected_num_rows, + std::move(expected_offsets), + expected_result_child.release(), + 2, + cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); + } + + { + // First result row, and the last two result rows should be null. 
+ auto preceding = 2; + auto following = 2; + auto min_periods = 4; + auto const result = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + + auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; + auto expected_result_child = + fixed_point_column_wrapper{expected_result_child_values.begin(), + expected_result_child_values.end(), + numeric::scale_type{0}}; + auto expected_offsets = fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); + auto expected_num_rows = expected_offsets->size() - 1; + auto null_mask_iter = make_counting_transform_iterator( + size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + + auto expected_result = make_lists_column( + expected_num_rows, + std::move(expected_offsets), + expected_result_child.release(), + 3, + cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); + } +} + TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) { using namespace cudf; From f9e418800a224fc915cb34b28e80805b330b0b24 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 25 Jan 2021 22:05:34 -0800 Subject: [PATCH 17/24] Removed namespace directives, for review. --- cpp/src/rolling/rolling_detail.cuh | 23 +++++++------------- cpp/tests/collect_list/collect_list_test.cpp | 2 +- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 51a6392672d..5e1e42c2bd1 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -908,14 +908,12 @@ struct rolling_window_launcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using namespace cudf; - // Materialize offsets column. 
- auto size_data_type = data_type{type_to_id()}; + auto static constexpr size_data_type = data_type{type_to_id()}; auto sizes = make_fixed_width_column(size_data_type, input.size(), mask_state::UNALLOCATED, stream, mr); auto mutable_sizes = sizes->mutable_view(); - thrust::transform(thrust::device, + thrust::transform(rmm::exec_policy(stream), preceding_begin, preceding_begin + input.size(), following_begin, @@ -967,9 +965,7 @@ struct rolling_window_launcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using namespace cudf; - - auto size_data_type = data_type{type_to_id()}; + auto static constexpr size_data_type = data_type{type_to_id()}; // First, reduce offsets column by key, to identify the number of times // an offset appears. @@ -997,7 +993,7 @@ struct rolling_window_launcher { auto scatter_keys = make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); auto reduced_by_key = - thrust::reduce_by_key(thrust::device, + thrust::reduce_by_key(rmm::exec_policy(stream), offsets.template begin() + 1, // Skip first 0 in offsets. 
offsets.template end(), thrust::make_constant_iterator(1), @@ -1006,12 +1002,12 @@ struct rolling_window_launcher { auto scatter_values_end = reduced_by_key.second; auto scatter_output = make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); - thrust::fill_n(thrust::device, + thrust::fill_n(rmm::exec_policy(stream), scatter_output->mutable_view().template begin(), num_child_rows, 0); // [0,0,0,...0] thrust::scatter( - thrust::device, + rmm::exec_policy(stream), scatter_values->mutable_view().template begin(), scatter_values_end, scatter_keys->view().template begin(), @@ -1027,7 +1023,7 @@ struct rolling_window_launcher { // inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5] auto per_row_mapping = make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); - thrust::inclusive_scan(thrust::device, + thrust::inclusive_scan(rmm::exec_policy(stream), scatter_output->view().template begin(), scatter_output->view().template end(), per_row_mapping->mutable_view().template begin()); @@ -1047,7 +1043,7 @@ struct rolling_window_launcher { stream, mr); thrust::transform( - thrust::device, + rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(per_row_mapping.size()), gather_map->mutable_view().template begin(), @@ -1079,9 +1075,6 @@ struct rolling_window_launcher { CUDF_EXPECTS(default_outputs.is_empty(), "COLLECT window function does not support default values."); - using namespace cudf; - using namespace cudf::detail; - if (input.is_empty()) return empty_like(input); // Fix up preceding/following iterators to respect column boundaries, diff --git a/cpp/tests/collect_list/collect_list_test.cpp b/cpp/tests/collect_list/collect_list_test.cpp index 2d3056d31cd..f42d814979c 100644 --- a/cpp/tests/collect_list/collect_list_test.cpp +++ b/cpp/tests/collect_list/collect_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From f897c62772f733629e4d6305761b1447d40df5c4 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 25 Jan 2021 22:32:56 -0800 Subject: [PATCH 18/24] Streamlined null mask construction: Removed unnecessary null count checks. --- cpp/src/rolling/rolling_detail.cuh | 36 +++++++++++++----------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 5e1e42c2bd1..2dd68184e74 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -901,12 +901,12 @@ struct rolling_window_launcher { } template - std::unique_ptr get_collect_list_offsets(column_view const& input, - PrecedingIter preceding_begin, - FollowingIter following_begin, - size_type min_periods, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr create_collect_offsets(column_view const& input, + PrecedingIter preceding_begin, + FollowingIter following_begin, + size_type min_periods, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Materialize offsets column. 
auto static constexpr size_data_type = data_type{type_to_id()}; @@ -926,7 +926,7 @@ struct rolling_window_launcher { } template - std::pair get_collect_list_null_mask( + std::pair create_collect_null_mask( column_view const& input, PrecedingIter preceding_iter, FollowingIter following_iter, @@ -934,9 +934,7 @@ struct rolling_window_launcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - rmm::device_buffer null_mask; - size_type null_count; - std::tie(null_mask, null_count) = valid_if( + return valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), [preceding_iter, following_iter, min_periods] __device__(auto i) { @@ -944,8 +942,6 @@ struct rolling_window_launcher { }, stream, mr); - return (null_count == 0) ? std::make_pair(rmm::device_buffer{0, stream, mr}, size_type{0}) - : std::make_pair(null_mask, null_count); } /** @@ -1031,11 +1027,11 @@ struct rolling_window_launcher { } template - std::unique_ptr get_gather_map_for_child_column(column_view const& child_offsets, - column_view const& per_row_mapping, - PrecedingIter preceding_iter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr create_collect_gather_map(column_view const& child_offsets, + column_view const& per_row_mapping, + PrecedingIter preceding_iter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto gather_map = make_fixed_width_column(data_type{type_to_id()}, per_row_mapping.size(), @@ -1094,14 +1090,14 @@ struct rolling_window_launcher { // Materialize collect list's offsets. auto offsets = - get_collect_list_offsets(input, preceding_begin, following_begin, min_periods, stream, mr); + create_collect_offsets(input, preceding_begin, following_begin, min_periods, stream, mr); // Map each element of the collect() result's child column // to the index where it appears in the input. 
auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream, mr); // Generate gather map to produce the collect() result's child column. - auto gather_map = get_gather_map_for_child_column( + auto gather_map = create_collect_gather_map( offsets->view(), per_row_mapping->view(), preceding_begin, stream, mr); // gather(), to construct child column. @@ -1111,7 +1107,7 @@ struct rolling_window_launcher { rmm::device_buffer null_mask; size_type null_count; std::tie(null_mask, null_count) = - get_collect_list_null_mask(input, preceding_begin, following_begin, min_periods, stream, mr); + create_collect_null_mask(input, preceding_begin, following_begin, min_periods, stream, mr); return make_lists_column(input.size(), std::move(offsets), From 3e0c39264a9ec550c57306ccaf4409acf06eaddc Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 25 Jan 2021 23:08:10 -0800 Subject: [PATCH 19/24] Better documentation for COLLECT helper functions. --- cpp/src/rolling/rolling_detail.cuh | 36 +++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 2dd68184e74..cb6fca4a235 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -900,6 +900,16 @@ struct rolling_window_launcher { mr); } + /** + * @brief Creates the offsets child of the result of the `COLLECT` window aggregation + * + * Given the input column, the preceding/following window bounds, and `min_periods`, + * the sizes of each list row may be computed. These values can then be used to + * calculate the offsets for the result of `COLLECT`. + * + * Note: If `min_periods` exceeds the number of observations for a window, the size + * is set to `0` (since the result is `null`). 
+ */ template std::unique_ptr create_collect_offsets(column_view const& input, PrecedingIter preceding_begin, @@ -913,6 +923,17 @@ struct rolling_window_launcher { auto sizes = make_fixed_width_column(size_data_type, input.size(), mask_state::UNALLOCATED, stream, mr); auto mutable_sizes = sizes->mutable_view(); + + // Consider the following preceding/following values: + // preceding = [1,2,2,2,2] + // following = [1,1,1,1,0] + // The sum of the vectors should yield the window sizes: + // prec + foll = [2,3,3,3,2] + // + // If min_periods=2, all rows have at least `min_periods` observations. + // But if min_periods=3, rows at indices 0 and 4 have too few observations, and must return + // null. The sizes at these positions must be 0, i.e. + // prec + foll = [0,3,3,3,0] thrust::transform(rmm::exec_policy(stream), preceding_begin, preceding_begin + input.size(), @@ -921,10 +942,18 @@ struct rolling_window_launcher { [min_periods] __device__(auto preceding, auto following) { return (preceding + following) < min_periods ? 0 : (preceding + following); }); + + // Convert `sizes` to an offsets column, via inclusive_scan(): return strings::detail::make_offsets_child_column( sizes->view().begin(), sizes->view().end(), stream, mr); } + /** + * @brief Create null mask for result of `COLLECT` aggregation. + * + * Given an input column's size, the preceding/following window bounds, and `min_periods`, + * this function returns a null mask, and the count of the number of nulls. + */ template std::pair create_collect_null_mask( column_view const& input, @@ -945,7 +974,8 @@ struct rolling_window_launcher { } /** - * @brief Generate collect() list child's mapping to input column. + * @brief Generate mapping of each row in the COLLECT result's child column + * to the index of the row it belongs to. 
* * If * input col == [A,B,C,D,E] @@ -1026,6 +1056,10 @@ struct rolling_window_launcher { return per_row_mapping; } + /** + * @brief Create gather map to generate the child column of the result of + * the `COLLECT` window aggregation. + */ template std::unique_ptr create_collect_gather_map(column_view const& child_offsets, column_view const& per_row_mapping, From 04417edb451398f348218854d9f924d08e16462d Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 25 Jan 2021 23:16:36 -0800 Subject: [PATCH 20/24] Update dates in Copyright --- cpp/include/cudf/lists/detail/scatter.cuh | 2 +- cpp/src/rolling/rolling_detail.cuh | 2 +- cpp/src/rolling/rolling_detail.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 15f975c4eb7..0c31b2b79a3 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-21, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index cb6fca4a235..8a8c951e80d 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-21, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index 235c0558710..d7fa92f1978 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 93be98c8be32ce8a63a43b5e533ba256f9f2ec04 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 25 Jan 2021 23:28:08 -0800 Subject: [PATCH 21/24] Using input column sizes instead of whole column --- cpp/src/rolling/rolling_detail.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 8a8c951e80d..347ceecc882 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -911,7 +911,7 @@ struct rolling_window_launcher { * is set to `0` (since the result is `null`). */ template - std::unique_ptr create_collect_offsets(column_view const& input, + std::unique_ptr create_collect_offsets(size_type const& input_size, PrecedingIter preceding_begin, FollowingIter following_begin, size_type min_periods, @@ -921,7 +921,7 @@ struct rolling_window_launcher { // Materialize offsets column. 
auto static constexpr size_data_type = data_type{type_to_id()}; auto sizes = - make_fixed_width_column(size_data_type, input.size(), mask_state::UNALLOCATED, stream, mr); + make_fixed_width_column(size_data_type, input_size, mask_state::UNALLOCATED, stream, mr); auto mutable_sizes = sizes->mutable_view(); // Consider the following preceding/following values: @@ -936,7 +936,7 @@ struct rolling_window_launcher { // prec + foll = [0,3,3,3,0] thrust::transform(rmm::exec_policy(stream), preceding_begin, - preceding_begin + input.size(), + preceding_begin + input_size, following_begin, mutable_sizes.begin(), [min_periods] __device__(auto preceding, auto following) { @@ -956,7 +956,7 @@ struct rolling_window_launcher { */ template std::pair create_collect_null_mask( - column_view const& input, + size_type const& input_size, PrecedingIter preceding_iter, FollowingIter following_iter, size_type min_periods, @@ -965,7 +965,7 @@ struct rolling_window_launcher { { return valid_if( thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), + thrust::make_counting_iterator(input_size), [preceding_iter, following_iter, min_periods] __device__(auto i) { return (preceding_iter[i] + following_iter[i]) >= min_periods; }, @@ -1123,8 +1123,8 @@ struct rolling_window_launcher { }); // Materialize collect list's offsets. - auto offsets = - create_collect_offsets(input, preceding_begin, following_begin, min_periods, stream, mr); + auto offsets = create_collect_offsets( + input.size(), preceding_begin, following_begin, min_periods, stream, mr); // Map each element of the collect() result's child column // to the index where it appears in the input. 
@@ -1140,8 +1140,8 @@ struct rolling_window_launcher { rmm::device_buffer null_mask; size_type null_count; - std::tie(null_mask, null_count) = - create_collect_null_mask(input, preceding_begin, following_begin, min_periods, stream, mr); + std::tie(null_mask, null_count) = create_collect_null_mask( + input.size(), preceding_begin, following_begin, min_periods, stream, mr); return make_lists_column(input.size(), std::move(offsets), From 76926d431aa9d20749305dcb457a96249aa2be4f Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Tue, 26 Jan 2021 12:58:39 -0800 Subject: [PATCH 22/24] Fixed copyrights. Refactored null mask construction. --- cpp/include/cudf/lists/detail/scatter.cuh | 2 +- cpp/src/rolling/rolling_detail.cuh | 39 ++++++----------------- cpp/tests/CMakeLists.txt | 2 +- 3 files changed, 12 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 0c31b2b79a3..32f6cc6db7a 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-21, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 347ceecc882..381e1d17625 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-21, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -911,7 +911,7 @@ struct rolling_window_launcher { * is set to `0` (since the result is `null`). 
*/ template - std::unique_ptr create_collect_offsets(size_type const& input_size, + std::unique_ptr create_collect_offsets(size_type input_size, PrecedingIter preceding_begin, FollowingIter following_begin, size_type min_periods, @@ -948,31 +948,6 @@ struct rolling_window_launcher { sizes->view().begin(), sizes->view().end(), stream, mr); } - /** - * @brief Create null mask for result of `COLLECT` aggregation. - * - * Given an input column's size, the preceding/following window bounds, and `min_periods`, - * this function returns a null mask, and the count of the number of nulls. - */ - template - std::pair create_collect_null_mask( - size_type const& input_size, - PrecedingIter preceding_iter, - FollowingIter following_iter, - size_type min_periods, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - return valid_if( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input_size), - [preceding_iter, following_iter, min_periods] __device__(auto i) { - return (preceding_iter[i] + following_iter[i]) >= min_periods; - }, - stream, - mr); - } - /** * @brief Generate mapping of each row in the COLLECT result's child column * to the index of the row it belongs to. 
@@ -1140,8 +1115,14 @@ struct rolling_window_launcher { rmm::device_buffer null_mask; size_type null_count; - std::tie(null_mask, null_count) = create_collect_null_mask( - input.size(), preceding_begin, following_begin, min_periods, stream, mr); + std::tie(null_mask, null_count) = valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + [preceding_begin, following_begin, min_periods] __device__(auto i) { + return (preceding_begin[i] + following_begin[i]) >= min_periods; + }, + stream, + mr); return make_lists_column(input.size(), std::move(offsets), diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index bc47b937701..ad05c871012 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 4bcd8524262de13eb781a5651a547b410d533f6e Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 27 Jan 2021 15:37:06 -0800 Subject: [PATCH 23/24] Test for Input columns with nulls. --- cpp/tests/collect_list/collect_list_test.cpp | 75 ++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/cpp/tests/collect_list/collect_list_test.cpp b/cpp/tests/collect_list/collect_list_test.cpp index f42d814979c..c9dee4b9c39 100644 --- a/cpp/tests/collect_list/collect_list_test.cpp +++ b/cpp/tests/collect_list/collect_list_test.cpp @@ -174,6 +174,81 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2->view()); } +TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) +{ + // Test that when the number of observations is fewer than min_periods, + // the result is null. 
+ // Input column has null inputs. + + using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto const input_column = + fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5}, {1, 0, 1, 1, 0, 1}}; + // auto const num_elements = static_cast(input_column).size(); + + { + // One result row at each end should be null. + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + + auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; + auto expected_result_child_validity = std::vector{1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1}; + auto expected_result_child = + fixed_width_column_wrapper(expected_result_child_values.begin(), + expected_result_child_values.end(), + expected_result_child_validity.begin()); + auto expected_offsets = fixed_width_column_wrapper{0, 0, 3, 6, 9, 12, 12}.release(); + auto expected_num_rows = expected_offsets->size() - 1; + auto null_mask_iter = make_counting_transform_iterator( + size_type{0}, [expected_num_rows](auto i) { return i != 0 && i != (expected_num_rows - 1); }); + + auto expected_result = make_lists_column( + expected_num_rows, + std::move(expected_offsets), + expected_result_child.release(), + 2, + cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); + } + + { + // First result row, and the last two result rows should be null. 
+ auto preceding = 2; + auto following = 2; + auto min_periods = 4; + auto const result = + rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + + auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; + auto expected_result_child_validity = std::vector{1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1}; + auto expected_result_child = + fixed_width_column_wrapper(expected_result_child_values.begin(), + expected_result_child_values.end(), + expected_result_child_validity.begin()); + + auto expected_offsets = fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); + auto expected_num_rows = expected_offsets->size() - 1; + auto null_mask_iter = make_counting_transform_iterator( + size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + + auto expected_result = make_lists_column( + expected_num_rows, + std::move(expected_offsets), + expected_result_child.release(), + 3, + cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); + } +} + TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) { // Test that when the number of observations is fewer than min_periods, From 95a3d49b5f91800d546616758c0f3aeb196a10d0 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 27 Jan 2021 20:48:40 -0800 Subject: [PATCH 24/24] More tests for nulled inputs. 
--- cpp/tests/collect_list/collect_list_test.cpp | 38 +++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/cpp/tests/collect_list/collect_list_test.cpp b/cpp/tests/collect_list/collect_list_test.cpp index c9dee4b9c39..8021d7171b3 100644 --- a/cpp/tests/collect_list/collect_list_test.cpp +++ b/cpp/tests/collect_list/collect_list_test.cpp @@ -393,6 +393,42 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } +TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls) +{ + using namespace cudf; + using namespace cudf::test; + + using T = TypeParam; + + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input_column = fixed_width_column_wrapper{ + {10, 11, 12, 13, 14, 20, 21, 22, 23}, {1, 0, 1, 1, 1, 1, 0, 1, 1}}; + + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, + input_column, + preceding, + following, + min_periods, + make_collect_aggregation()); + + auto expected_child = fixed_width_column_wrapper{ + {10, 11, 10, 11, 12, 11, 12, 13, 12, 13, 14, 13, 14, 20, 21, 20, 21, 22, 21, 22, 23, 22, 23}, + {1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1}}; + + auto expected_offsets = fixed_width_column_wrapper{0, 2, 5, 8, 11, 13, 15, 18, 21, 23}; + + auto expected_result = make_lists_column(static_cast(group_column).size(), + expected_offsets.release(), + expected_child.release(), + 0, + {}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); +} + TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) { using namespace cudf; @@ -666,4 +702,4 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); } 
-CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file +CUDF_TEST_PROGRAM_MAIN()