From 763c53ac2b915781c09fb3d2f4daa1c240fdbe15 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 31 Aug 2021 23:31:44 +0530 Subject: [PATCH 01/79] add CORR aggregation to groupby, headers, classes, visitor(sort) --- cpp/include/cudf/aggregation.hpp | 13 +++++++- .../cudf/detail/aggregation/aggregation.hpp | 30 +++++++++++++++++++ cpp/src/aggregation/aggregation.cpp | 20 +++++++++++++ cpp/src/groupby/sort/aggregate.cpp | 23 ++++++++++++++ cpp/src/groupby/sort/group_reductions.hpp | 18 ++++++++++- 5 files changed, 102 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index ff665e2706a..1d03fb613df 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -87,7 +87,8 @@ class aggregation { CUDA, ///< CUDA UDF based reduction MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_M2 ///< merge partial values of M2 aggregation + MERGE_M2, ///< merge partial values of M2 aggregation, + CORR, ///< correlation among multiple columns }; aggregation() = delete; @@ -488,5 +489,15 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a CORR aggregation + * + * Compute correlation matrix amond the input columns. + * The input columns are child columns of a non-nullable struct columns. 
+ * + */ +template +std::unique_ptr make_corr_aggregation(); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4e4c63ae517..e2177e64ba0 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -91,6 +91,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class corr_aggregation const& agg); }; class aggregation_finalizer { // Declares the interface for the finalizer @@ -125,6 +127,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class corr_aggregation const& agg); }; /** @@ -884,6 +887,25 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying CORR aggregation + */ +class corr_aggregation final : public groupby_aggregation { + public: + explicit corr_aggregation() : aggregation{CORR} {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. 
* @@ -1118,6 +1140,12 @@ struct target_type_impl { using type = struct_view; }; +// Always use struct for CORR +template +struct target_type_impl { + using type = double; +}; + /** * @brief Helper alias to get the accumulator type for performing aggregation * `k` on elements of type `Source` @@ -1222,6 +1250,8 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::CORR: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index f0c522257fb..07883be1491 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -202,6 +202,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, corr_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + // aggregation_finalizer ---------------------------------------- void aggregation_finalizer::visit(aggregation const& agg) {} @@ -346,6 +352,11 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(corr_aggregation const& agg) +{ + visit(static_cast(agg)); +} + } // namespace detail std::vector> aggregation::get_simple_aggregations( @@ -664,6 +675,15 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +/// Factory to create a CORR aggregation +template +std::unique_ptr make_corr_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_corr_aggregation(); +template std::unique_ptr make_corr_aggregation(); + namespace detail 
{ namespace { struct target_type_functor { diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 726b51b7702..718fd191db1 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -524,6 +524,29 @@ void aggregate_result_functor::operator()(aggregation con detail::group_merge_m2( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; +/** + * @brief Perform correlation among child columns of non-nullable struct column. + * + * The output of this aggregation is also a non-nullable struct column. The child columns of the + * output struct column are the corresponding correlation of each input child column. + * + * The correlation is done for each group of the input struct column. + * + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + cache.add_result(col_idx, + agg, + detail::group_corr(get_grouped_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + helper.num_groups(stream), + stream, + mr)); +}; } // namespace detail diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 2770162da2d..6bb87d7ea6a 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -441,7 +441,23 @@ std::unique_ptr group_merge_m2(column_view const& values, size_type num_groups, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); - +/** + * @brief Internal API to find correlation of child columns of a non-nullable struct column. + * TODO fill documentation. + * + * @param values Grouped values (tuples of values `(valid_count, mean, M2)`) to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels ID of group that the corresponding value belongs to + * @param num_groups Number of groups. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr group_corr(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** @endinternal * */ From 4c989a953e92a028de620c0e311c063d258ee1ca Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 31 Aug 2021 23:38:45 +0530 Subject: [PATCH 02/79] add group_corr.cu --- cpp/CMakeLists.txt | 1 + cpp/src/groupby/sort/group_corr.cu | 279 +++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+) create mode 100644 cpp/src/groupby/sort/group_corr.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 18af85c98e0..189638e5d08 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -218,6 +218,7 @@ add_library(cudf src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu src/groupby/sort/group_collect.cu + src/groupby/sort/group_corr.cu src/groupby/sort/group_count.cu src/groupby/sort/group_m2.cu src/groupby/sort/group_max.cu diff --git a/cpp/src/groupby/sort/group_corr.cu b/cpp/src/groupby/sort/group_corr.cu new file mode 100644 index 00000000000..dbe64f0d54c --- /dev/null +++ b/cpp/src/groupby/sort/group_corr.cu @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "cudf/types.hpp" +#include "groupby/sort/group_reductions.hpp" +#include "thrust/functional.h" +#include "thrust/iterator/counting_iterator.h" +#include "thrust/iterator/zip_iterator.h" + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace { + +template +constexpr bool is_double_convertible() +{ + return std::is_convertible_v || std::is_constructible_v; +} + +struct is_double_convertible_impl { + template + bool operator()() + { + return is_double_convertible(); + } +}; + +/** + * @brief Type casts each element of the column to `CastType` + * + */ +template +struct type_casted_accessor { + template + CUDA_DEVICE_CALLABLE CastType operator()(cudf::size_type i, column_device_view const& col) const + { + if constexpr (column_device_view::has_element_accessor() and + std::is_convertible_v) + return static_cast(col.element(i)); + return {}; + } +}; + +template +struct corr_transform { // : thrust::unary_function + column_device_view const d_values_0, d_values_1; + ResultType const *d_means_0, *d_means_1; + ResultType const *d_stddev_0, *d_stddev_1; + size_type const* d_group_sizes; + size_type const* d_group_labels; + size_type ddof{1}; // TODO update based on bias. 
+ + __device__ ResultType operator()(size_type i) + { + if (d_values_0.is_null(i) or d_values_1.is_null(i)) return 0.0; + + // This has to be device dispatch because x and y type may differ + auto x = type_dispatcher(d_values_0.type(), type_casted_accessor{}, i, d_values_0); + auto y = type_dispatcher(d_values_1.type(), type_casted_accessor{}, i, d_values_1); + + size_type group_idx = d_group_labels[i]; + size_type group_size = d_group_sizes[group_idx]; + + // prevent divide by zero error + if (group_size == 0 or group_size - ddof <= 0) return 0.0; + + ResultType xmean = d_means_0[group_idx]; + ResultType ymean = d_means_1[group_idx]; + ResultType xstddev = d_stddev_0[group_idx]; + ResultType ystddev = d_stddev_1[group_idx]; + return (x - xmean) * (y - ymean) / (group_size - ddof) / xstddev / ystddev; + } +}; + +/* +sum((x-xu)*(y-yu)) +transform_output_iterator /N-1, stdx, stdy how do you know the indices? we can not. +So, +(x-xu)*(y-yu))/N-1/stdx/stdy as single iterator., then reduce_by_key. +very similar to var_transform in group_std. +*/ + +std::tuple, std::unique_ptr> group_mean_stddev( + column_view const& values_0, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto sum1 = detail::group_sum(values_0, num_groups, group_labels, stream, mr); + auto count1 = values_0.nullable() + ? 
detail::group_count_valid(values_0, group_labels, num_groups, stream, mr) + : detail::group_count_all(group_offsets, num_groups, stream, mr); + auto mean1 = + cudf::detail::binary_operation(*sum1, + *count1, + binary_operator::DIV, + cudf::detail::target_type(values_0.type(), aggregation::MEAN), + stream, + mr); + + auto var1 = detail::group_var(values_0, + *mean1, + *count1, + group_labels, + 1, // default var_agg._ddof, + stream, + mr); + auto stddev1 = cudf::detail::unary_operation(*var1, unary_operator::SQRT, stream, mr); + return std::make_tuple(std::move(mean1), std::move(stddev1)); +} + +} // namespace + +// TODO Eventually this function should accept values_0, values_1, not a struct. +std::unique_ptr group_corr(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `group_corr` must be a structs column."); + CUDF_EXPECTS(values.num_children() == 2, + "Input to `group_corr` must be a structs column having 2 children columns."); + CUDF_EXPECTS(values.nullable() == false, + "Input to `group_corr` must be a non-nullable structs column."); + std::cout << "size=" << values.size() << std::endl; + std::cout << "num_children=" << values.num_children() << std::endl; + + using result_type = id_to_type; + static_assert( + std::is_same_v, result_type>); + + // check if each child type can be converted to float64. + bool const is_convertible = + std::all_of(values.child_begin(), values.child_end(), [](auto const& c) { + return type_dispatcher(c.type(), is_double_convertible_impl{}); + }); + CUDF_EXPECTS(is_convertible, + "Input to `group_corr` must be a structs column having all children columns of type " + "convertible to float64."); + + // TODO calculate SUM + // TODO calculate COUNT_VALID (need to do for 2 seperately. 
for MEAN, and + // bitmask_and->COUNT_VALID for CORR.) + // TODO calculate MEAN + // TODO calculate VARIANCE + // TODO calculate STDDEV + // TODO calculate CORR. (requires MEAN1, MEAN2, COUNT_VALID_ANDed, STDDEV1, STDDEV2) + // TODO shuffle. + + auto const& values_0 = values.child(0); + auto const& values_1 = values.child(1); + // TODO fix caching of child sum, count_valid, mean, variance, stddev. [unsupported due to + // result_cache design] + auto [mean0, stddev0] = + group_mean_stddev(values_0, group_offsets, group_labels, num_groups, stream, mr); + auto [mean1, stddev1] = + group_mean_stddev(values_1, group_offsets, group_labels, num_groups, stream, mr); + + auto mean0_ptr = mean0->mutable_view().begin(); + auto mean1_ptr = mean1->mutable_view().begin(); + auto stddev0_ptr = stddev0->mutable_view().begin(); + auto stddev1_ptr = stddev1->mutable_view().begin(); + + // TODO replace with ANDed bitmask. (values, stddev) + auto count1 = values_0.nullable() + ? detail::group_count_valid(values_0, group_labels, num_groups, stream, mr) + : detail::group_count_all(group_offsets, num_groups, stream, mr); + + auto d_values_0 = column_device_view::create(values_0, stream); + auto d_values_1 = column_device_view::create(values_1, stream); + corr_transform corr_transform_op{*d_values_0, + *d_values_1, + mean0_ptr, + mean1_ptr, + stddev0_ptr, + stddev1_ptr, + count1->view().data(), + group_labels.begin()}; + + // result + auto const any_nulls = std::any_of( + values.child_begin(), values.child_end(), [](auto const& c) { return c.has_nulls(); }); + auto mask_type = any_nulls ? 
mask_state::UNINITIALIZED : mask_state::UNALLOCATED; + + auto result = + make_numeric_column(data_type(type_to_id()), num_groups, mask_type, stream, mr); + auto d_result = result->mutable_view().begin(); + + auto corr_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), corr_transform_op); + + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + corr_iter, + thrust::make_discard_iterator(), + d_result); + return result; + + // auto result_M2s = make_numeric_column( + // data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + // auto validities = rmm::device_uvector(num_groups, stream); + + // // Perform merging for all the aggregations. Their output (and their validity data) are written + // // out concurrently through an output zip iterator. + // using iterator_tuple = thrust::tuple; + // using output_iterator = thrust::zip_iterator; + // auto const out_iter = + // output_iterator{thrust::make_tuple(result_counts->mutable_view().template data(), + // result_means->mutable_view().template data(), + // result_M2s->mutable_view().template data(), + // validities.begin())}; + + // auto const count_valid = values.child(0); + // auto const mean_values = values.child(1); + // auto const M2_values = values.child(2); + // auto const iter = thrust::make_counting_iterator(0); + + // auto const fn = merge_fn{group_offsets.begin(), + // count_valid.template begin(), + // mean_values.template begin(), + // M2_values.template begin()}; + // thrust::transform(rmm::exec_policy(stream), iter, iter + num_groups, out_iter, fn); + + // // Generate bitmask for the output. + // // Only mean and M2 values can be nullable. Count column must be non-nullable. 
+ // auto [null_mask, null_count] = cudf::detail::valid_if( + // validities.begin(), validities.end(), thrust::identity{}, stream, mr); + // if (null_count > 0) { + // result_means->set_null_mask(null_mask, null_count); // copy null_mask + // result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask + // } + + // Output is a structs column containing the merged values of `COUNT_VALID`, `MEAN`, and `M2`. + + return result; +} + +} // namespace detail +} // namespace groupby +} // namespace cudf From 015795cf875b75f0088be30f5e38c11bcacb6363 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 31 Aug 2021 23:38:57 +0530 Subject: [PATCH 03/79] add unit test temporarily --- cpp/tests/groupby/mean_tests.cpp | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 613e1555b79..9bceebfb241 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include @@ -160,5 +161,63 @@ TEST_F(groupby_dictionary_mean_test, basic) keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); } +struct groupby_corr_test : public cudf::test::BaseFixture { +}; +template +using fwcw = fixed_width_column_wrapper; +using structs = structs_column_wrapper; + +TEST_F(groupby_corr_test, basic) +{ + using K = int32_t; + using M0 = uint8_t; + using M1 = int16_t; + using R = cudf::detail::target_type_t; + + // clang-format off + auto keys = fwcw { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2 }; + auto member_0 = fwcw{{ 1, 1, 1, 2, 2, 3, 3, 1, 1, 4 }};//, null_at(1)}; + auto member_1 = fwcw{{ 1, 1, 1, 2, -2, 3, 3, 1, 1, -4 }};//, null_at(7)}; + auto values = structs{{member_0, member_1}};//, null_at(4)}; + // clang-format on + + fixed_width_column_wrapper expect_keys({1, 2, 3}); + fixed_width_column_wrapper expect_vals{ + {1.000000, -0.41522739926869984, std::numeric_limits::quiet_NaN()}}; //, null_at(2)}; + // clang-format on + + auto agg = cudf::make_corr_aggregation(); + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = values; + + requests[0].aggregations.push_back(std::move(agg)); + requests.emplace_back(groupby::aggregation_request()); + // WAR to force groupby to use sort implementation + requests[0].aggregations.push_back(make_nth_element_aggregation(0)); + + requests[1].values = column_view(values).child(0); + requests[1].aggregations.push_back(cudf::make_mean_aggregation()); + requests[1].aggregations.push_back(cudf::make_std_aggregation()); + requests.emplace_back(groupby::aggregation_request()); + requests[2].values = column_view(values).child(1); + requests[2].aggregations.push_back(cudf::make_mean_aggregation()); + requests[2].aggregations.push_back(cudf::make_std_aggregation()); + + groupby::groupby gb_obj(table_view({keys})); + auto result = gb_obj.aggregate(requests); + + cudf::test::print(*result.second[0].results[0]); + 
cudf::test::print(*result.second[1].results[0]); + cudf::test::print(*result.second[1].results[1]); + cudf::test::print(*result.second[2].results[0]); + cudf::test::print(*result.second[2].results[1]); + + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({expect_keys}), result.first->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( + expect_vals, *result.second[0].results[0], debug_output_level::ALL_ERRORS); + // test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf From ba6e50af611def7e73789be74c57971068f1fe7e Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 1 Sep 2021 18:02:13 -0700 Subject: [PATCH 04/79] create new PR for pearson groupby correlation --- python/cudf/cudf/_lib/cpp/aggregation.pxd | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 13bfa49057c..19605a60d8d 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -38,17 +38,13 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' + CORRELATION 'cudf::aggregation::CORRELATION' + Kind kind cdef cppclass rolling_aggregation: aggregation.Kind kind - cdef cppclass groupby_aggregation: - aggregation.Kind kind - - cdef cppclass groupby_scan_aggregation: - aggregation.Kind kind - ctypedef enum udf_type: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' From b198a5158f70461ba2d8361dfe9a93f8baa55a26 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 2 Sep 2021 16:32:26 -0700 Subject: [PATCH 05/79] adding corr. 
func in python --- python/cudf/cudf/_lib/cpp/aggregation.pxd | 2 ++ python/cudf/cudf/core/groupby/groupby.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 19605a60d8d..1bd500facac 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -102,3 +102,5 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: udf_type type, string user_defined_aggregator, data_type output_type) except + + + cdef unique_ptr[T] make_correlation_aggregation[T]() except + diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index fd425d9de76..b8de9c7b8cd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1021,6 +1021,10 @@ def _mimic_pandas_order( result.index = self.obj.index return result + def correlation(self): + """ + """ + class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): """ From 3d0030739742859c50a645a3d3d6e5cdab05263e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 6 Sep 2021 09:40:27 +0530 Subject: [PATCH 06/79] Revert "create new PR for pearson groupby correlation" This reverts commit ba6e50af611def7e73789be74c57971068f1fe7e. 
--- python/cudf/cudf/_lib/cpp/aggregation.pxd | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 1bd500facac..b19f526d539 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -38,13 +38,17 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' - CORRELATION 'cudf::aggregation::CORRELATION' - Kind kind cdef cppclass rolling_aggregation: aggregation.Kind kind + cdef cppclass groupby_aggregation: + aggregation.Kind kind + + cdef cppclass groupby_scan_aggregation: + aggregation.Kind kind + ctypedef enum udf_type: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' From 120043744c2b4da68cf35a4d7a2efa48f8631cbd Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 6 Sep 2021 09:40:45 +0530 Subject: [PATCH 07/79] Revert "adding corr. func in python" This reverts commit b198a5158f70461ba2d8361dfe9a93f8baa55a26. 
--- python/cudf/cudf/_lib/cpp/aggregation.pxd | 2 -- python/cudf/cudf/core/groupby/groupby.py | 4 ---- 2 files changed, 6 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index b19f526d539..13bfa49057c 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -106,5 +106,3 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: udf_type type, string user_defined_aggregator, data_type output_type) except + - - cdef unique_ptr[T] make_correlation_aggregation[T]() except + diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 045863c8892..d98a78efb18 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1058,10 +1058,6 @@ def _mimic_pandas_order( result.index = self.obj.index return result - def correlation(self): - """ - """ - class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): """ From 60293cc679f172183a4a8a22e49d8de5bc9896d7 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 6 Sep 2021 12:22:50 +0530 Subject: [PATCH 08/79] rename CORR to CORRELATION, added correlation_type as arg --- cpp/include/cudf/aggregation.hpp | 10 +++--- .../cudf/detail/aggregation/aggregation.hpp | 33 +++++++++++++------ cpp/src/aggregation/aggregation.cpp | 14 ++++---- cpp/src/groupby/hash/groupby.cu | 11 +++++++ cpp/src/groupby/sort/aggregate.cpp | 10 ++---- cpp/src/groupby/sort/group_corr.cu | 3 +- cpp/tests/groupby/mean_tests.cpp | 5 +-- 7 files changed, 56 insertions(+), 30 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 1d03fb613df..fdc4e966748 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -88,7 +88,7 @@ class aggregation { MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate 
entries MERGE_M2, ///< merge partial values of M2 aggregation, - CORR, ///< correlation among multiple columns + CORRELATION, ///< correlation between two sets of elements }; aggregation() = delete; @@ -144,6 +144,7 @@ class groupby_scan_aggregation : public virtual aggregation { }; enum class udf_type : bool { CUDA, PTX }; +enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN }; /// Factory to create a SUM aggregation template @@ -490,14 +491,15 @@ template std::unique_ptr make_merge_m2_aggregation(); /** - * @brief Factory to create a CORR aggregation + * @brief Factory to create a CORRELATION aggregation * - * Compute correlation matrix amond the input columns. + * Compute correlation coefficient between two columns. * The input columns are child columns of a non-nullable struct columns. * + * @param[in] type: correlation_type */ template -std::unique_ptr make_corr_aggregation(); +std::unique_ptr make_correlation_aggregation(correlation_type type); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index e2177e64ba0..2f7dbd73cb8 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -92,7 +92,7 @@ class simple_aggregations_collector { // Declares the interface for the simple virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); virtual std::vector> visit(data_type col_type, - class corr_aggregation const& agg); + class correlation_aggregation const& agg); }; class aggregation_finalizer { // Declares the interface for the finalizer @@ -127,7 +127,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); - virtual void visit(class 
corr_aggregation const& agg); + virtual void visit(class correlation_aggregation const& agg); }; /** @@ -888,15 +888,25 @@ class merge_m2_aggregation final : public groupby_aggregation { }; /** - * @brief Derived aggregation class for specifying CORR aggregation + * @brief Derived aggregation class for specifying CORRELATION aggregation */ -class corr_aggregation final : public groupby_aggregation { +class correlation_aggregation final : public groupby_aggregation { public: - explicit corr_aggregation() : aggregation{CORR} {} + explicit correlation_aggregation(correlation_type type) : aggregation{CORRELATION}, _type{type} {} + correlation_type _type; + + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_type == other._type); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); } std::vector> get_simple_aggregations( data_type col_type, simple_aggregations_collector& collector) const override @@ -904,6 +914,9 @@ class corr_aggregation final : public groupby_aggregation { return collector.visit(col_type, *this); } void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + protected: + size_t hash_impl() const { return std::hash{}(static_cast(_type)); } }; /** @@ -1140,9 +1153,9 @@ struct target_type_impl { using type = struct_view; }; -// Always use struct for CORR +// Always use struct for CORRELATION template -struct target_type_impl { +struct target_type_impl { using type = double; }; @@ -1250,8 +1263,8 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); - case aggregation::CORR: - return 
f.template operator()(std::forward(args)...); + case aggregation::CORRELATION: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 07883be1491..175b480ce92 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -203,7 +203,7 @@ std::vector> simple_aggregations_collector::visit( } std::vector> simple_aggregations_collector::visit( - data_type col_type, corr_aggregation const& agg) + data_type col_type, correlation_aggregation const& agg) { return visit(col_type, static_cast(agg)); } @@ -352,7 +352,7 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } -void aggregation_finalizer::visit(corr_aggregation const& agg) +void aggregation_finalizer::visit(correlation_aggregation const& agg) { visit(static_cast(agg)); } @@ -677,12 +677,14 @@ template std::unique_ptr make_merge_m2_aggregation -std::unique_ptr make_corr_aggregation() +std::unique_ptr make_correlation_aggregation(correlation_type type) { - return std::make_unique(); + return std::make_unique(type); } -template std::unique_ptr make_corr_aggregation(); -template std::unique_ptr make_corr_aggregation(); +template std::unique_ptr make_correlation_aggregation( + correlation_type type); +template std::unique_ptr make_correlation_aggregation( + correlation_type type); namespace detail { namespace { diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 87f83c6edd6..a9c64efc5db 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -157,6 +157,17 @@ class groupby_simple_aggregations_collector final return aggs; } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + 
aggs.push_back(make_count_aggregation()); + + return aggs; + } }; template diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 718fd191db1..aa7d9ac01c9 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -524,17 +524,13 @@ void aggregate_result_functor::operator()(aggregation con detail::group_merge_m2( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; + /** - * @brief Perform correlation among child columns of non-nullable struct column. - * - * The output of this aggregation is also a non-nullable struct column. The child columns of the - * output struct column are the corresponding correlation of each input child column. - * - * The correlation is done for each group of the input struct column. + * @brief Perform correlation betweeen two child columns of non-nullable struct column. * */ template <> -void aggregate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) { return; } diff --git a/cpp/src/groupby/sort/group_corr.cu b/cpp/src/groupby/sort/group_corr.cu index dbe64f0d54c..35f29a1bb59 100644 --- a/cpp/src/groupby/sort/group_corr.cu +++ b/cpp/src/groupby/sort/group_corr.cu @@ -165,7 +165,8 @@ std::unique_ptr group_corr(column_view const& values, using result_type = id_to_type; static_assert( - std::is_same_v, result_type>); + std::is_same_v, + result_type>); // check if each child type can be converted to float64. 
bool const is_convertible = diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 9bceebfb241..9cbeca8163f 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -172,7 +172,7 @@ TEST_F(groupby_corr_test, basic) using K = int32_t; using M0 = uint8_t; using M1 = int16_t; - using R = cudf::detail::target_type_t; + using R = cudf::detail::target_type_t; // clang-format off auto keys = fwcw { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2 }; @@ -186,7 +186,8 @@ TEST_F(groupby_corr_test, basic) {1.000000, -0.41522739926869984, std::numeric_limits::quiet_NaN()}}; //, null_at(2)}; // clang-format on - auto agg = cudf::make_corr_aggregation(); + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); std::vector requests; requests.emplace_back(groupby::aggregation_request()); requests[0].values = values; From d421d6d818dd676c1b0717c694d89c11c5a9f835 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 8 Sep 2021 00:03:48 +0530 Subject: [PATCH 09/79] add shallow_hash(column_view) --- cpp/include/cudf/column/column_view.hpp | 20 ++++++++++++++++++++ cpp/include/cudf/types.hpp | 12 ++++++++++++ cpp/src/column/column_view.cpp | 19 +++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 7feaeafbad0..43386e926d2 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -633,4 +633,24 @@ column_view bit_cast(column_view const& input, data_type type); */ mutable_column_view bit_cast(mutable_column_view const& input, data_type type); +namespace detail { +/** + * @brief Computes a hash value on the specified column view based on the shallow state of the + * column view. + * + * Only the shallow states (i.e pointers instead of data pointed by the pointer) of the column view + * are used in the hash computation. 
The hash value is computed recursively on the children of the + * column view. + * The states used for the hash computation are: type, size, data pointer, null_mask pointer, + * offset, and the hash value of the children. Note that `null_count` is not used. + * + * Note: This hash function may result in different hash for a copy of the same column with exactly + * same contents. It is guarenteed to give same hash value for same column_view only, even if the + * underlying data changes. + * + * @param input The `column_view` to compute hash + * @return The hash value + */ +size_t shallow_hash(column_view const& input); +} // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index e1037efb5c8..37c5a4b424c 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -345,3 +345,15 @@ static constexpr uint32_t DEFAULT_HASH_SEED = 0; /** @} */ } // namespace cudf + +// specialization of std::hash for cudf::data_type +namespace std { +template <> +struct hash { + std::size_t operator()(cudf::data_type const& type) const noexcept + { + return std::hash{}(static_cast(type.id())) * 127 + + std::hash{}(type.scale()); + } +}; +} // namespace std diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 186669ae697..d1202108ae5 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -22,6 +22,7 @@ #include +#include #include #include #include @@ -76,6 +77,24 @@ size_type column_view_base::null_count(size_type begin, size_type end) const ? 0 : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); } + +// simple prime number multiplication algorithm. 
+// Adapted from http://myeyesareblind.com/2017/02/06/Combine-hash-values/#apachecommons +constexpr void combine_hash(size_t& h1, size_t h2) { h1 = h1 * 127 + h2; } + +size_t shallow_hash(column_view const& input) +{ + size_t hash = 0; + combine_hash(hash, std::hash{}(input.type())); + combine_hash(hash, std::hash{}(input.size())); + combine_hash(hash, std::hash{}(input.head())); + combine_hash(hash, std::hash{}(input.null_mask())); + combine_hash(hash, std::hash{}(input.offset())); + std::for_each(input.child_begin(), input.child_end(), [&hash](auto const& child) { + combine_hash(hash, shallow_hash(child)); + }); + return hash; +} } // namespace detail // Immutable view constructor From 9c4a9f315338545c9616914ac8e2a73b38596cb5 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 8 Sep 2021 00:13:43 +0530 Subject: [PATCH 10/79] add CompoundTypes to type_lists --- cpp/include/cudf_test/type_lists.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 5c1b0c6c458..49550e55e33 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -303,6 +303,18 @@ using FixedWidthTypesWithoutFixedPoint = Concat; */ using ComparableTypes = Concat; +/** + * @brief Provides a list of all compound types for use in GTest typed tests. + * + * Example: + * ``` + * // Invokes all typed fixture tests for all compound types in libcudf + * TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes); + * ``` + */ +using CompoundTypes = + cudf::test::Types; + /** * @brief Provides a list of all types supported in libcudf for use in a GTest * typed test. 
From a3dd235a48307b779d999644fa4a55679b759a40 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 8 Sep 2021 00:14:04 +0530 Subject: [PATCH 11/79] add shallow_hash tests --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/column/column_view_shallow_test.cpp | 211 ++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 cpp/tests/column/column_view_shallow_test.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9553d463ab..8b608c2bfd0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -33,6 +33,7 @@ endfunction() # - column tests ---------------------------------------------------------------------------------- ConfigureTest(COLUMN_TEST column/bit_cast_test.cpp + column/column_view_shallow_test.cpp column/column_test.cu column/column_device_view_test.cu column/compound_test.cu) diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp new file mode 100644 index 00000000000..09e7e4eb689 --- /dev/null +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +template +std::unique_ptr example_column() +{ + // fixed_width, dict, string, list, struct + if constexpr (cudf::is_fixed_width()) { + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); + } else if constexpr (cudf::is_dictionary()) { + return cudf::test::dictionary_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0}) + .release(); + } else if constexpr (std::is_same_v or std::is_same_v) { + return cudf::test::strings_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}) + .release(); + } else if constexpr (std::is_same_v) { + return cudf::test::lists_column_wrapper({{1, 2, 3}, {4, 5}, {}, {6, 7, 8}}).release(); + } else if constexpr (std::is_same_v) { + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + auto member_0 = cudf::test::fixed_width_column_wrapper(begin, end); + auto member_1 = cudf::test::fixed_width_column_wrapper(begin + 10, end + 10); + return cudf::test::structs_column_wrapper({member_0, member_1}).release(); + } + return {}; +} + +template +struct ColumnViewShallowTests : public cudf::test::BaseFixture { +}; + +using AllTypes = cudf::test::Concat; +TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); + +// Test for fixed_width, dict, string, list, struct +// column_view, column_view = same hash. +// column_view, make a copy = same hash. +// column_view old, update data + new column_view = same hash. +// column_view old, add null_mask + new column_view = diff hash. +// column_view old, update nulls + new column_view = same hash. +// column_view old, set_null_count + new column_view = same hash. +// +// column_view, diff column = diff hash. 
+// column_view, sliced[0, size) = same hash (for split too) +// column_view, sliced[n:) = diff hash (for split too) +// column_view, bit_cast = diff hash +// +// mutable_column_view, column_view = same hash +// mutable_column_view, modified mutable_column_view = same hash +// +// update the children column data = same hash +// update the children column_views = diff hash + +TYPED_TEST(ColumnViewShallowTests, shallow_hash) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_copy)); + } + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // update data + new column_view = same hash. + { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // add null_mask + new column_view = diff hash. 
+ { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + col_view_new.null_count(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view_new), shallow_hash(col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new2)); + } + + // column_view, diff column = diff hash. 
+ { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_diff)); + } + // column_view, sliced[0, size] = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_bitcast)); + } + } + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + EXPECT_EQ(shallow_hash(col_view), 
shallow_hash(col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_child_updated)); + } + } +} From 2365d07960dfefc2ec5f22fb05c6b471de022945 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 8 Sep 2021 01:29:27 +0530 Subject: [PATCH 12/79] add column copy test --- cpp/tests/column/column_view_shallow_test.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 09e7e4eb689..b0f6eeac450 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -27,6 +27,7 @@ #include +#include #include template @@ -67,6 +68,7 @@ TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); // Test for fixed_width, dict, string, list, struct // column_view, column_view = same hash. // column_view, make a copy = same hash. +// column_view, copy column = diff hash // column_view old, update data + new column_view = same hash. // column_view old, add null_mask + new column_view = diff hash. // column_view old, update nulls + new column_view = same hash. 
@@ -97,6 +99,12 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash) auto col_view_copy = col_view; EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_copy)); } + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_copy)); + } // new column_view from column = same hash { auto col_view_new = cudf::column_view{*col}; From 88726a451fc267c462ce7f9d1486f74bde2782ab Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 8 Sep 2021 01:34:07 +0530 Subject: [PATCH 13/79] add shallow_equal(column_view) and tests --- cpp/include/cudf/column/column_view.hpp | 17 +++ cpp/src/column/column_view.cpp | 14 ++ cpp/tests/column/column_view_shallow_test.cpp | 133 ++++++++++++++++++ 3 files changed, 164 insertions(+) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 43386e926d2..a77351fe731 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -652,5 +652,22 @@ namespace detail { * @return The hash value */ size_t shallow_hash(column_view const& input); + +/** + * @brief Equality operator for column views based on the shallow state of the column view. + * + * Only shallow states used for the hash computation are: type, size, data pointer, null_mask + * pointer, offset and the column_view of the children recursively. Note that `null_count` is not + * used. + * + * Note: This equality function will consider a column not equal to a copy of the same column with + * exactly same contents. It is guarenteed to return true for same column_view only, even if the + * underlying data changes. 
+ * + * @param lhs The left `column_view` to compare + * @param rhs The right `column_view` to compare + * @return true if the shallow states of the two column views are equal + */ +bool shallow_equal(column_view const& lhs, column_view const& rhs); } // namespace detail } // namespace cudf diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index d1202108ae5..7e0bde86b74 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -95,6 +95,20 @@ size_t shallow_hash(column_view const& input) }); return hash; } + +bool is_shallow_equal(column_view const& lhs, column_view const& rhs) +{ + return (lhs.type() == rhs.type()) and (lhs.size() == rhs.size()) and + (lhs.head() == rhs.head()) and (lhs.null_mask() == rhs.null_mask()) and + (lhs.offset() == rhs.offset()) and + std::equal(lhs.child_begin(), + lhs.child_end(), + rhs.child_begin(), + rhs.child_end(), + [](auto const& lhs_child, auto const& rhs_child) { + return is_shallow_equal(lhs_child, rhs_child); + }); +} } // namespace detail // Immutable view constructor diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index b0f6eeac450..25af9b968e6 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -217,3 +217,136 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash) } } } + +TYPED_TEST(ColumnViewShallowTests, shallow_equal) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_TRUE(shallow_equal(col_view, col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_TRUE(shallow_equal(col_view, col_view_copy)); + } + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_FALSE(shallow_equal(col_view, col_view_copy)); + } + // new column_view from column = same hash + { + 
auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + } + // update data + new column_view = same hash. + { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + } + // add null_mask + new column_view = diff hash. + { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_FALSE(shallow_equal(col_view, col_view_new)); + col_view_new.null_count(); + EXPECT_FALSE(shallow_equal(col_view, col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(shallow_equal(col_view_new, col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + } + // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(shallow_equal(col_view, col_view_new2)); + } + + // column_view, diff column = diff hash. 
+ { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_FALSE(shallow_equal(col_view, col_view_diff)); + } + // column_view, sliced[0, size] = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_TRUE(shallow_equal(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_FALSE(shallow_equal(col_view, col_split[0])); + EXPECT_TRUE(shallow_equal(col_view, col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_FALSE(shallow_equal(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_FALSE(shallow_equal(col_view, col_split[0])); + EXPECT_FALSE(shallow_equal(col_view, col_split[1])); + } + // column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_FALSE(shallow_equal(col_view, col_bitcast)); + } + } + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_TRUE(shallow_equal(col_mutable, col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + EXPECT_TRUE(shallow_equal(col_view, col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + 
EXPECT_TRUE(shallow_equal(col_mutable, col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_FALSE(shallow_equal(col_view, col_child_updated)); + } + } +} From d52509de3f69b02378f268b4540e0565a7c4589e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 8 Sep 2021 12:23:26 +0530 Subject: [PATCH 14/79] update result_cache to use shallow_hash, shallow_equal --- .../cudf/detail/aggregation/result_cache.hpp | 30 +++-- cpp/src/aggregation/result_cache.cpp | 33 +++-- cpp/src/groupby/common/utils.hpp | 2 +- cpp/src/groupby/hash/groupby.cu | 106 +++++++--------- cpp/src/groupby/sort/aggregate.cpp | 115 +++++++++--------- cpp/src/groupby/sort/functors.hpp | 6 +- cpp/src/groupby/sort/scan.cpp | 31 +++-- 7 files changed, 152 insertions(+), 171 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index ebb1ea784e5..a15e15d7d01 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -23,15 +23,19 @@ namespace cudf { namespace detail { -struct aggregation_equality { - bool operator()(aggregation const& lhs, aggregation const& rhs) const +struct pair_column_aggregation_equal_to { + bool operator()(std::pair const& lhs, + std::pair const& rhs) const { - return lhs.is_equal(rhs); + return is_shallow_equal(lhs.first, rhs.first) and lhs.second.is_equal(rhs.second); } }; -struct aggregation_hash { - size_t operator()(aggregation const& key) const noexcept { return key.do_hash(); } +struct pair_column_aggregation_hash { + size_t operator()(std::pair const& key) const noexcept + { + return shallow_hash(key.first) * 127 + key.second.do_hash(); + } }; class result_cache { @@ -43,19 +47,19 @@ 
class result_cache { result_cache(size_t num_columns) : _cache(num_columns) {} - bool has_result(size_t col_idx, aggregation const& agg) const; + bool has_result(column_view const& input, aggregation const& agg) const; - void add_result(size_t col_idx, aggregation const& agg, std::unique_ptr&& col); + void add_result(column_view const& input, aggregation const& agg, std::unique_ptr&& col); - column_view get_result(size_t col_idx, aggregation const& agg) const; + column_view get_result(column_view const& input, aggregation const& agg) const; - std::unique_ptr release_result(size_t col_idx, aggregation const& agg); + std::unique_ptr release_result(column_view const& input, aggregation const& agg); private: - std::vector, - std::pair, std::unique_ptr>, - aggregation_hash, - aggregation_equality>> + std::unordered_map>, + std::pair, std::unique_ptr>, + pair_column_aggregation_hash, + pair_column_aggregation_equal_to> _cache; }; diff --git a/cpp/src/aggregation/result_cache.cpp b/cpp/src/aggregation/result_cache.cpp index 36668af5355..04750f7fa98 100644 --- a/cpp/src/aggregation/result_cache.cpp +++ b/cpp/src/aggregation/result_cache.cpp @@ -19,38 +19,37 @@ namespace cudf { namespace detail { -bool result_cache::has_result(size_t col_idx, aggregation const& agg) const +bool result_cache::has_result(column_view const& input, aggregation const& agg) const { - if (col_idx > _cache.size()) return false; - - auto result_it = _cache[col_idx].find(agg); - - return (result_it != _cache[col_idx].end()); + return _cache.count({input, agg}); } -void result_cache::add_result(size_t col_idx, aggregation const& agg, std::unique_ptr&& col) +void result_cache::add_result(column_view const& input, + aggregation const& agg, + std::unique_ptr&& col) { // We can't guarantee that agg will outlive the cache, so we need to take ownership of a copy. // To allow lookup by reference, make the key a reference and keep the owner in the value pair. 
- auto owned_agg = agg.clone(); - auto const& key = *owned_agg; - auto value = std::make_pair(std::move(owned_agg), std::move(col)); - _cache[col_idx].emplace(key, std::move(value)); + auto owned_agg = agg.clone(); + auto const& key = *owned_agg; + auto value = std::make_pair(std::move(owned_agg), std::move(col)); + _cache[{input, key}] = std::move(value); } -column_view result_cache::get_result(size_t col_idx, aggregation const& agg) const +column_view result_cache::get_result(column_view const& input, aggregation const& agg) const { - CUDF_EXPECTS(has_result(col_idx, agg), "Result does not exist in cache"); + CUDF_EXPECTS(has_result(input, agg), "Result does not exist in cache"); - auto result_it = _cache[col_idx].find(agg); + auto result_it = _cache.find({input, agg}); return result_it->second.second->view(); } -std::unique_ptr result_cache::release_result(size_t col_idx, aggregation const& agg) +std::unique_ptr result_cache::release_result(column_view const& input, + aggregation const& agg) { - CUDF_EXPECTS(has_result(col_idx, agg), "Result does not exist in cache"); + CUDF_EXPECTS(has_result(input, agg), "Result does not exist in cache"); - auto result_it = _cache[col_idx].extract(agg); + auto result_it = _cache.extract({input, agg}); return std::move(result_it.mapped().second); } diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index 3da20fb9af3..129351c3d38 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -33,7 +33,7 @@ inline std::vector extract_results(host_span class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - size_t col_idx; column_view col; data_type result_type; cudf::detail::result_cache* sparse_results; @@ -170,14 +169,13 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final size_type const map_size; Map const& map; bitmask_type const* __restrict__ row_bitmask; - rmm::mr::device_memory_resource* mr; 
rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; public: using cudf::detail::aggregation_finalizer::visit; - hash_compound_agg_finalizer(size_t col_idx, - column_view col, + hash_compound_agg_finalizer(column_view col, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, @@ -186,8 +184,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : col_idx(col_idx), - col(col), + : col(col), sparse_results(sparse_results), dense_results(dense_results), gather_map(gather_map), @@ -203,7 +200,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final auto to_dense_agg_result(cudf::aggregation const& agg) { - auto s = sparse_results->get_result(col_idx, agg); + auto s = sparse_results->get_result(col, agg); auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), gather_map.begin(), gather_map.begin() + map_size, @@ -240,43 +237,43 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final // Declare overloads for each kind of aggregation to dispatch void visit(cudf::aggregation const& agg) override { - if (dense_results->has_result(col_idx, agg)) return; - dense_results->add_result(col_idx, agg, to_dense_agg_result(agg)); + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); } void visit(cudf::detail::min_aggregation const& agg) override { - if (dense_results->has_result(col_idx, agg)) return; + if (dense_results->has_result(col, agg)) return; if (result_type.id() == type_id::STRING) { auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col_idx, agg, gather_argminmax(*transformed_agg)); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); } else { - dense_results->add_result(col_idx, agg, 
to_dense_agg_result(agg)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); } } void visit(cudf::detail::max_aggregation const& agg) override { - if (dense_results->has_result(col_idx, agg)) return; + if (dense_results->has_result(col, agg)) return; if (result_type.id() == type_id::STRING) { auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col_idx, agg, gather_argminmax(*transformed_agg)); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); } else { - dense_results->add_result(col_idx, agg, to_dense_agg_result(agg)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); } } void visit(cudf::detail::mean_aggregation const& agg) override { - if (dense_results->has_result(col_idx, agg)) return; + if (dense_results->has_result(col, agg)) return; auto sum_agg = make_sum_aggregation(); auto count_agg = make_count_aggregation(); this->visit(*sum_agg); this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col_idx, *sum_agg); - column_view count_result = dense_results->get_result(col_idx, *count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); auto result = cudf::detail::binary_operation(sum_result, @@ -285,19 +282,19 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::target_type(result_type, aggregation::MEAN), stream, mr); - dense_results->add_result(col_idx, agg, std::move(result)); + dense_results->add_result(col, agg, std::move(result)); } void visit(cudf::detail::var_aggregation const& agg) override { - if (dense_results->has_result(col_idx, agg)) return; + if (dense_results->has_result(col, agg)) return; auto sum_agg = make_sum_aggregation(); auto count_agg = make_count_aggregation(); this->visit(*sum_agg); this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col_idx, *sum_agg); - column_view 
count_result = sparse_results->get_result(col_idx, *count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); auto values_view = column_device_view::create(col); auto sum_view = column_device_view::create(sum_result); @@ -315,47 +312,40 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final col.size(), ::cudf::detail::var_hash_functor{ map, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col_idx, agg, std::move(var_result)); - dense_results->add_result(col_idx, agg, to_dense_agg_result(agg)); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); } void visit(cudf::detail::std_aggregation const& agg) override { - if (dense_results->has_result(col_idx, agg)) return; + if (dense_results->has_result(col, agg)) return; auto var_agg = make_variance_aggregation(agg._ddof); this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col_idx, *var_agg); + column_view variance = dense_results->get_result(col, *var_agg); auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col_idx, agg, std::move(result)); + dense_results->add_result(col, agg, std::move(result)); } }; // flatten aggs to filter in single pass aggs -std::tuple, - std::vector>, - std::vector> +std::tuple, std::vector>> flatten_single_pass_aggs(host_span requests) { std::vector columns; std::vector> aggs; std::vector agg_kinds; - std::vector col_ids; - for (size_t i = 0; i < requests.size(); i++) { - auto const& request = requests[i]; - auto const& agg_v = request.aggregations; + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; std::unordered_set agg_kinds_set; - auto insert_agg = - [&](size_t i, column_view const& 
request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - col_ids.push_back(i); - } - }; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; auto values_type = cudf::is_dictionary(request.values.type()) ? cudf::dictionary_column_view(request.values).keys().type() @@ -364,13 +354,12 @@ flatten_single_pass_aggs(host_span requests) groupby_simple_aggregations_collector collector; for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(i, request.values, std::move(agg_s)); + insert_agg(request.values, std::move(agg_s)); } } } - return std::make_tuple( - table_view(columns), std::move(agg_kinds), std::move(aggs), std::move(col_ids)); + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); } /** @@ -397,22 +386,14 @@ void sparse_to_dense_results(table_view const& keys, bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; - for (size_t i = 0; i < requests.size(); i++) { - auto const& agg_v = requests[i].aggregations; - auto const& col = requests[i].values; + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer(i, - col, - sparse_results, - dense_results, - gather_map, - map_size, - map, - row_bitmask_ptr, - stream, - mr); + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, map_size, map, row_bitmask_ptr, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -500,7 +481,7 @@ void compute_single_pass_aggs(table_view const& keys, rmm::cuda_stream_view stream) { // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs, col_ids] = flatten_single_pass_aggs(requests); + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); // make table that will hold sparse results table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); @@ -528,7 +509,8 @@ void compute_single_pass_aggs(table_view const& keys, auto sparse_result_cols = sparse_table.release(); for (size_t i = 0; i < aggs.size(); i++) { // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result(col_ids[i], *aggs[i], std::move(sparse_result_cols[i])); + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); } } diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 726b51b7702..b4143de7a86 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -64,10 +64,10 @@ struct aggregate_result_functor final : store_result_functor { 
template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; cache.add_result( - col_idx, + values, agg, get_grouped_values().nullable() ? detail::group_count_valid( @@ -79,10 +79,10 @@ void aggregate_result_functor::operator()(aggregation template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; cache.add_result( - col_idx, + values, agg, detail::group_count_all(helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } @@ -90,10 +90,10 @@ void aggregate_result_functor::operator()(aggregation co template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; cache.add_result( - col_idx, + values, agg, detail::group_sum( get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); @@ -102,10 +102,10 @@ void aggregate_result_functor::operator()(aggregation const& a template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; cache.add_result( - col_idx, + values, agg, detail::group_product( get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); @@ -114,9 +114,9 @@ void aggregate_result_functor::operator()(aggregation cons template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; - cache.add_result(col_idx, + cache.add_result(values, agg, detail::group_argmax(get_grouped_values(), helper.num_groups(stream), @@ -129,9 +129,9 @@ void aggregate_result_functor::operator()(aggregation const template <> void 
aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; - cache.add_result(col_idx, + cache.add_result(values, agg, detail::group_argmin(get_grouped_values(), helper.num_groups(stream), @@ -144,7 +144,7 @@ void aggregate_result_functor::operator()(aggregation const template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto result = [&]() { auto values_type = cudf::is_dictionary(values.type()) @@ -156,7 +156,7 @@ void aggregate_result_functor::operator()(aggregation const& a } else { auto argmin_agg = make_argmin_aggregation(); operator()(*argmin_agg); - column_view argmin_result = cache.get_result(col_idx, *argmin_agg); + column_view argmin_result = cache.get_result(values, *argmin_agg); // We make a view of ARGMIN result without a null mask and gather using // this mask. The values in data buffer of ARGMIN result corresponding @@ -178,13 +178,13 @@ void aggregate_result_functor::operator()(aggregation const& a } }(); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto result = [&]() { auto values_type = cudf::is_dictionary(values.type()) @@ -196,7 +196,7 @@ void aggregate_result_functor::operator()(aggregation const& a } else { auto argmax_agg = make_argmax_aggregation(); operator()(*argmax_agg); - column_view argmax_result = cache.get_result(col_idx, *argmax_agg); + column_view argmax_result = cache.get_result(values, *argmax_agg); // We make a view of ARGMAX result without a null mask and gather using // this mask. 
The values in data buffer of ARGMAX result corresponding @@ -218,20 +218,20 @@ void aggregate_result_functor::operator()(aggregation const& a } }(); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto sum_agg = make_sum_aggregation(); auto count_agg = make_count_aggregation(); operator()(*sum_agg); operator()(*count_agg); - column_view sum_result = cache.get_result(col_idx, *sum_agg); - column_view count_result = cache.get_result(col_idx, *count_agg); + column_view sum_result = cache.get_result(values, *sum_agg); + column_view count_result = cache.get_result(values, *count_agg); // TODO (dm): Special case for timestamp. Add target_type_impl for it. // Blocked until we support operator+ on timestamps @@ -242,20 +242,20 @@ void aggregate_result_functor::operator()(aggregation const& cudf::detail::target_type(values.type(), aggregation::MEAN), stream, mr); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto const mean_agg = make_mean_aggregation(); operator()(*mean_agg); - auto const mean_result = cache.get_result(col_idx, *mean_agg); + auto const mean_result = cache.get_result(values, *mean_agg); cache.add_result( - col_idx, + values, agg, detail::group_m2(get_grouped_values(), mean_result, helper.group_labels(stream), stream, mr)); }; @@ -263,15 +263,15 @@ void aggregate_result_functor::operator()(aggregation const& ag template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto& var_agg = 
dynamic_cast(agg); auto mean_agg = make_mean_aggregation(); auto count_agg = make_count_aggregation(); operator()(*mean_agg); operator()(*count_agg); - column_view mean_result = cache.get_result(col_idx, *mean_agg); - column_view group_sizes = cache.get_result(col_idx, *count_agg); + column_view mean_result = cache.get_result(values, *mean_agg); + column_view group_sizes = cache.get_result(values, *count_agg); auto result = detail::group_var(get_grouped_values(), mean_result, @@ -280,31 +280,31 @@ void aggregate_result_functor::operator()(aggregation con var_agg._ddof, stream, mr); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto& std_agg = dynamic_cast(agg); auto var_agg = make_variance_aggregation(std_agg._ddof); operator()(*var_agg); - column_view var_result = cache.get_result(col_idx, *var_agg); + column_view var_result = cache.get_result(values, *var_agg); auto result = cudf::detail::unary_operation(var_result, unary_operator::SQRT, stream, mr); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view group_sizes = cache.get_result(col_idx, *count_agg); + column_view group_sizes = cache.get_result(values, *count_agg); auto& quantile_agg = dynamic_cast(agg); auto result = detail::group_quantiles(get_sorted_values(), @@ -315,17 +315,17 @@ void aggregate_result_functor::operator()(aggregation con quantile_agg._interpolation, stream, mr); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, 
std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view group_sizes = cache.get_result(col_idx, *count_agg); + column_view group_sizes = cache.get_result(values, *count_agg); auto result = detail::group_quantiles(get_sorted_values(), group_sizes, @@ -335,13 +335,13 @@ void aggregate_result_functor::operator()(aggregation const interpolation::LINEAR, stream, mr); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto& nunique_agg = dynamic_cast(agg); @@ -352,13 +352,13 @@ void aggregate_result_functor::operator()(aggregation cons nunique_agg._null_handling, stream, mr); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; auto& nth_element_agg = dynamic_cast(agg); @@ -370,9 +370,9 @@ void aggregate_result_functor::operator()(aggregation } else { CUDF_FAIL("Wrong count aggregation kind"); } - column_view group_sizes = cache.get_result(col_idx, *count_agg); + column_view group_sizes = cache.get_result(values, *count_agg); - cache.add_result(col_idx, + cache.add_result(values, agg, detail::group_nth_element(get_grouped_values(), group_sizes, @@ -388,7 +388,7 @@ void aggregate_result_functor::operator()(aggregation template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) { return; } + if (cache.has_result(values, agg)) { return; } auto const 
null_handling = dynamic_cast(agg)._null_handling; @@ -398,13 +398,13 @@ void aggregate_result_functor::operator()(aggregation null_handling, stream, mr); - cache.add_result(col_idx, agg, std::move(result)); + cache.add_result(values, agg, std::move(result)); }; template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) { return; } + if (cache.has_result(values, agg)) { return; } auto const null_handling = dynamic_cast(agg)._null_handling; @@ -419,7 +419,7 @@ void aggregate_result_functor::operator()(aggregation auto const nans_equal = dynamic_cast(agg)._nans_equal; cache.add_result( - col_idx, + values, agg, lists::detail::drop_list_duplicates( lists_column_view(collect_result->view()), nulls_equal, nans_equal, stream, mr)); @@ -443,10 +443,10 @@ void aggregate_result_functor::operator()(aggregation template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) { return; } + if (cache.has_result(values, agg)) { return; } cache.add_result( - col_idx, + values, agg, detail::group_merge_lists( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); @@ -479,7 +479,7 @@ void aggregate_result_functor::operator()(aggregation template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) { return; } + if (cache.has_result(values, agg)) { return; } auto const merged_result = detail::group_merge_lists(get_grouped_values(), helper.group_offsets(stream), @@ -487,7 +487,7 @@ void aggregate_result_functor::operator()(aggregation c stream, rmm::mr::get_current_device_resource()); auto const& merge_sets_agg = dynamic_cast(agg); - cache.add_result(col_idx, + cache.add_result(values, agg, lists::detail::drop_list_duplicates(lists_column_view(merged_result->view()), merge_sets_agg._nulls_equal, @@ -516,10 +516,10 @@ void aggregate_result_functor::operator()(aggregation c template 
<> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) { return; } + if (cache.has_result(values, agg)) { return; } cache.add_result( - col_idx, + values, agg, detail::group_merge_m2( get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); @@ -538,13 +538,12 @@ std::pair, std::vector> groupby::sort // sum and count. std depends on mean and count cudf::detail::result_cache cache(requests.size()); - for (size_t i = 0; i < requests.size(); i++) { + for (auto const& request : requests) { auto store_functor = - detail::aggregate_result_functor(i, requests[i].values, helper(), cache, stream, mr); - for (size_t j = 0; j < requests[i].aggregations.size(); j++) { + detail::aggregate_result_functor(request.values, helper(), cache, stream, mr); + for (auto const& agg : request.aggregations) { // TODO (dm): single pass compute all supported reductions - cudf::detail::aggregation_dispatcher( - requests[i].aggregations[j]->kind, store_functor, *requests[i].aggregations[j]); + cudf::detail::aggregation_dispatcher(agg->kind, store_functor, *agg); } } diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index afb92f8e141..cbe5f08639a 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -36,13 +36,12 @@ namespace detail { * of these values. 
*/ struct store_result_functor { - store_result_functor(size_type col_idx, - column_view const& values, + store_result_functor(column_view const& values, sort::sort_groupby_helper& helper, cudf::detail::result_cache& cache, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : col_idx(col_idx), helper(helper), cache(cache), values(values), stream(stream), mr(mr) + : helper(helper), cache(cache), values(values), stream(stream), mr(mr) { } @@ -80,7 +79,6 @@ struct store_result_functor { }; protected: - size_type col_idx; ///< Index of column in requests being operated on sort::sort_groupby_helper& helper; ///< Sort helper cudf::detail::result_cache& cache; ///< cache of results to store into column_view const& values; ///< Column of values to group and aggregate diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index c43df77bb5e..3e260dee8c4 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -66,10 +66,10 @@ struct scan_result_functor final : store_result_functor { template <> void scan_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; cache.add_result( - col_idx, + values, agg, detail::sum_scan( get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); @@ -78,10 +78,10 @@ void scan_result_functor::operator()(aggregation const& agg) template <> void scan_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; cache.add_result( - col_idx, + values, agg, detail::min_scan( get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); @@ -90,10 +90,10 @@ void scan_result_functor::operator()(aggregation const& agg) template <> void scan_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if 
(cache.has_result(values, agg)) return; cache.add_result( - col_idx, + values, agg, detail::max_scan( get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); @@ -102,15 +102,15 @@ void scan_result_functor::operator()(aggregation const& agg) template <> void scan_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; - cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(stream), stream, mr)); + cache.add_result(values, agg, detail::count_scan(helper.group_labels(stream), stream, mr)); } template <> void scan_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; CUDF_EXPECTS(helper.is_presorted(), "Rank aggregate in groupby scan requires the keys to be presorted"); auto const order_by = get_grouped_values(); @@ -122,7 +122,7 @@ void scan_result_functor::operator()(aggregation const& agg) "Unsupported nested columns in grouped rank scan."); cache.add_result( - col_idx, + values, agg, detail::rank_scan( order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); @@ -131,7 +131,7 @@ void scan_result_functor::operator()(aggregation const& agg) template <> void scan_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) return; + if (cache.has_result(values, agg)) return; CUDF_EXPECTS(helper.is_presorted(), "Dense rank aggregate in groupby scan requires the keys to be presorted"); auto const order_by = get_grouped_values(); @@ -143,7 +143,7 @@ void scan_result_functor::operator()(aggregation const& "Unsupported nested columns in grouped dense_rank scan."); cache.add_result( - col_idx, + values, agg, detail::dense_rank_scan( order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); @@ -161,10 +161,9 @@ std::pair, std::vector> groupby::sort // sum and count. 
std depends on mean and count cudf::detail::result_cache cache(requests.size()); - for (size_t i = 0; i < requests.size(); i++) { - auto store_functor = - detail::scan_result_functor(i, requests[i].values, helper(), cache, stream, mr); - for (auto const& aggregation : requests[i].aggregations) { + for (auto const& request : requests) { + auto store_functor = detail::scan_result_functor(request.values, helper(), cache, stream, mr); + for (auto const& aggregation : request.aggregations) { // TODO (dm): single pass compute all supported reductions cudf::detail::aggregation_dispatcher(aggregation->kind, store_functor, *aggregation); } From d9a8bd77261d6bc57f82249bf318729ce85506af Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 8 Sep 2021 12:42:24 +0530 Subject: [PATCH 15/79] Update cpp/include/cudf/column/column_view.hpp Co-authored-by: Jake Hemstad --- cpp/include/cudf/column/column_view.hpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 43386e926d2..03e3c201a4b 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -635,18 +635,13 @@ mutable_column_view bit_cast(mutable_column_view const& input, data_type type); namespace detail { /** - * @brief Computes a hash value on the specified column view based on the shallow state of the - * column view. + * @brief Computes a hash value from the shallow state of the specified column * - * Only the shallow states (i.e pointers instead of data pointed by the pointer) of the column view - * are used in the hash computation. The hash value is computed recursively on the children of the - * column view. - * The states used for the hash computation are: type, size, data pointer, null_mask pointer, - * offset, and the hash value of the children. Note that `null_count` is not used. 
+ * Two `column_view`s, `c1` and `c2`, that view the exact same physical column will produce equal `shallow_hash()` values, i.e., `is_shallow_equal(c0, c1)` implies `shallow_hash(c0) == shallow_hash(c1)`. * - * Note: This hash function may result in different hash for a copy of the same column with exactly - * same contents. It is guarenteed to give same hash value for same column_view only, even if the - * underlying data changes. + * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., it is independent of the number of elements in the column. + * + * This function does _not_ inspect the elements of `input` nor access any device memory or launch any kernels. * * @param input The `column_view` to compute hash * @return The hash value From d96f870309d4c1cb0e7b93f4b32cfdfff543313c Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 8 Sep 2021 10:23:53 -0700 Subject: [PATCH 16/79] added definition of correlation() in cython --- Untitled.ipynb | 33 +++++++++++++++++++++++ python/cudf/cudf/_lib/aggregation.pyx | 15 +++++++++++ python/cudf/cudf/_lib/cpp/aggregation.pxd | 4 +++ 3 files changed, 52 insertions(+) create mode 100644 Untitled.ipynb diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 00000000000..e38548d42a9 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "06d9628c-d48e-40cb-a90b-ab83ce92af3b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 097018fe3c0..1ee329a545e 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -321,6 +321,13 @@ cdef class Aggregation: )) return agg + @classmethod + def correlation(cls): + cdef Aggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_correlation_aggregation[aggregation]()) + return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -674,6 +681,14 @@ cdef class GroupbyAggregation: ) return agg + @classmethod + def correlation(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_correlation_aggregation[groupby_aggregation]()) + return agg + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 13bfa49057c..db4c5f023a6 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -38,6 +38,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' + CORRELATION 'cudf::aggregation::CORRELATION' + Kind kind cdef cppclass rolling_aggregation: @@ -106,3 +108,5 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: udf_type type, string user_defined_aggregator, data_type output_type) except + + + cdef unique_ptr[T] make_correlation_aggregation[T]() except + From 7e7f250d5f85b06a81a26c536f24d9e69c5b8831 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 13 Sep 2021 14:47:24 +0530 Subject: [PATCH 17/79] ignore data, nullmask, offset if parent size is empty --- cpp/include/cudf/column/column_view.hpp | 12 +- cpp/src/column/column_view.cpp | 39 ++++-- cpp/tests/column/column_view_shallow_test.cpp | 126 +++++++++++++----- 3 files 
changed, 123 insertions(+), 54 deletions(-) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 03e3c201a4b..b7cf833d063 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -637,15 +637,19 @@ namespace detail { /** * @brief Computes a hash value from the shallow state of the specified column * - * Two `column_view`s, `c1` and `c2`, that view the exact same physical column will produce equal `shallow_hash()` values, i.e., `is_shallow_equal(c0, c1)` implies `shallow_hash(c0) == shallow_hash(c1)`. + * Two `column_view`s, `c1` and `c2`, that view the exact same physical column will produce equal + * `shallow_hash()` values, i.e., `is_shallow_equal(c0, c1)` implies `shallow_hash(c0) == + * shallow_hash(c1)`. * - * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., it is independent of the number of elements in the column. + * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., + * it is independent of the number of elements in the column. * - * This function does _not_ inspect the elements of `input` nor access any device memory or launch any kernels. + * This function does _not_ inspect the elements of `input` nor access any device memory or launch + * any kernels. * * @param input The `column_view` to compute hash * @return The hash value */ -size_t shallow_hash(column_view const& input); +std::size_t shallow_hash(column_view const& input); } // namespace detail } // namespace cudf diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index d1202108ae5..c0c1f9d4acd 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -80,21 +80,32 @@ size_type column_view_base::null_count(size_type begin, size_type end) const // simple prime number multiplication algorithm. 
// Adapted from http://myeyesareblind.com/2017/02/06/Combine-hash-values/#apachecommons -constexpr void combine_hash(size_t& h1, size_t h2) { h1 = h1 * 127 + h2; } +constexpr void combine_hash(std::size_t& h1, std::size_t h2) { h1 = h1 * 127 + h2; } + +struct shallow_hash_impl { + std::size_t operator()(column_view const& input, bool is_parent_empty = false) + { + std::size_t hash = 0; + combine_hash(hash, std::hash{}(input.type())); + combine_hash(hash, std::hash{}(input.size())); + if (not(input.is_empty() or is_parent_empty)) { + combine_hash(hash, std::hash{}(input.head())); + combine_hash(hash, std::hash{}(input.null_mask())); + combine_hash(hash, std::hash{}(input.offset())); + } + hash = std::accumulate( + input.child_begin(), + input.child_end(), + hash, + [&input, is_parent_empty](std::size_t hash, auto const& child) { + combine_hash(hash, shallow_hash_impl{}(child, input.is_empty() or is_parent_empty)); + return hash; + }); + return hash; + } +}; -size_t shallow_hash(column_view const& input) -{ - size_t hash = 0; - combine_hash(hash, std::hash{}(input.type())); - combine_hash(hash, std::hash{}(input.size())); - combine_hash(hash, std::hash{}(input.head())); - combine_hash(hash, std::hash{}(input.null_mask())); - combine_hash(hash, std::hash{}(input.offset())); - std::for_each(input.child_begin(), input.child_end(), [&hash](auto const& child) { - combine_hash(hash, shallow_hash(child)); - }); - return hash; -} +std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl{}(input); } } // namespace detail // Immutable view constructor diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index b0f6eeac450..2d881a3c872 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -30,32 +30,48 @@ #include #include -template +// fixed_width, dict, string, list, struct +template ()>* = nullptr> +std::unique_ptr example_column() +{ + auto 
begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template ()>* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::dictionary_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0}) + .release(); +} + +template or + std::is_same_v>* = nullptr> +std::unique_ptr example_column() + +{ + return cudf::test::strings_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}) + .release(); +} + +template >* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::lists_column_wrapper({{1, 2, 3}, {4, 5}, {}, {6, 7, 8}}).release(); +} + +template >* = nullptr> std::unique_ptr example_column() { - // fixed_width, dict, string, list, struct - if constexpr (cudf::is_fixed_width()) { - auto begin = thrust::make_counting_iterator(1); - auto end = thrust::make_counting_iterator(16); - return cudf::test::fixed_width_column_wrapper(begin, end).release(); - } else if constexpr (cudf::is_dictionary()) { - return cudf::test::dictionary_column_wrapper( - {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0}) - .release(); - } else if constexpr (std::is_same_v or std::is_same_v) { - return cudf::test::strings_column_wrapper( - {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}) - .release(); - } else if constexpr (std::is_same_v) { - return cudf::test::lists_column_wrapper({{1, 2, 3}, {4, 5}, {}, {6, 7, 8}}).release(); - } else if constexpr (std::is_same_v) { - auto begin = thrust::make_counting_iterator(1); - auto end = thrust::make_counting_iterator(16); - auto member_0 = cudf::test::fixed_width_column_wrapper(begin, end); - auto member_1 = cudf::test::fixed_width_column_wrapper(begin + 10, end + 10); - return cudf::test::structs_column_wrapper({member_0, member_1}).release(); - } - return {}; + auto begin = 
thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + auto member_0 = cudf::test::fixed_width_column_wrapper(begin, end); + auto member_1 = cudf::test::fixed_width_column_wrapper(begin + 10, end + 10); + return cudf::test::structs_column_wrapper({member_0, member_1}).release(); } template @@ -68,13 +84,15 @@ TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); // Test for fixed_width, dict, string, list, struct // column_view, column_view = same hash. // column_view, make a copy = same hash. +// new column_view from colmn = same hash // column_view, copy column = diff hash +// column_view, diff column = diff hash. +// // column_view old, update data + new column_view = same hash. // column_view old, add null_mask + new column_view = diff hash. // column_view old, update nulls + new column_view = same hash. // column_view old, set_null_count + new column_view = same hash. // -// column_view, diff column = diff hash. // column_view, sliced[0, size) = same hash (for split too) // column_view, sliced[n:) = diff hash (for split too) // column_view, bit_cast = diff hash @@ -85,7 +103,7 @@ TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); // update the children column data = same hash // update the children column_views = diff hash -TYPED_TEST(ColumnViewShallowTests, shallow_hash) +TYPED_TEST(ColumnViewShallowTests, shallow_hash_basic) { using namespace cudf::detail; auto col = example_column(); @@ -99,17 +117,32 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash) auto col_view_copy = col_view; EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_copy)); } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // copy column = diff hash { auto col_new = std::make_unique(*col); auto col_view_copy = col_new->view(); EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_copy)); } - // new column_view from column = same hash + + 
// column_view, diff column = diff hash. { - auto col_view_new = cudf::column_view{*col}; - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_diff)); } +} +TYPED_TEST(ColumnViewShallowTests, shallow_hash_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; // update data + new column_view = same hash. { // update data by modifying some bits: fixed_width, string, dict, list, struct @@ -151,14 +184,14 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash) auto col_view_new2 = cudf::column_view{*col}; EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new2)); } +} - // column_view, diff column = diff hash. - { - auto col_diff = example_column(); - auto col_view_diff = cudf::column_view{*col_diff}; - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_diff)); - } - // column_view, sliced[0, size] = same hash (for split too) +TYPED_TEST(ColumnViewShallowTests, shallow_hash_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) { auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_sliced[0])); @@ -174,6 +207,20 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash) EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[1])); } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + 
EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_sliced[2])); + EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_new_sliced[0])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_new_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[2]), shallow_hash(col_new_sliced[2])); + } + // column_view, bit_cast = diff hash { if constexpr (std::is_integral_v and not std::is_same_v) { @@ -185,6 +232,13 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash) EXPECT_NE(shallow_hash(col_view), shallow_hash(col_bitcast)); } } +} + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; // mutable_column_view, column_view = same hash { auto col_mutable = cudf::mutable_column_view{*col}; From 00051540682e8340ad4b04e59c7c56bdf43bbcbb Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 14 Sep 2021 02:25:04 +0530 Subject: [PATCH 18/79] is_shallow_equal ignore children states for empty column. 
(not children type) --- cpp/include/cudf/column/column_view.hpp | 2 +- cpp/src/column/column_view.cpp | 22 ++++ cpp/tests/column/column_view_shallow_test.cpp | 100 ++++++++++++------ 3 files changed, 91 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index b490eae44d3..25d13d04207 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -667,6 +667,6 @@ std::size_t shallow_hash(column_view const& input); * @param rhs The right `column_view` to compare * @return true if the shallow states of the two column views are equal */ -bool shallow_equal(column_view const& lhs, column_view const& rhs); +bool is_shallow_equal(column_view const& lhs, column_view const& rhs); } // namespace detail } // namespace cudf diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index c0c1f9d4acd..4447273d7ff 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -106,6 +106,28 @@ struct shallow_hash_impl { }; std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl{}(input); } + +struct shallow_equal_impl { + bool operator()(column_view const& lhs, column_view const& rhs, bool is_parent_empty = false) + { + bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; + return (lhs.type() == rhs.type()) and + (is_empty or + ((lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and + (lhs.null_mask() == rhs.null_mask()) and (lhs.offset() == rhs.offset()))) and + std::equal(lhs.child_begin(), + lhs.child_end(), + rhs.child_begin(), + rhs.child_end(), + [is_empty](auto const& lhs_child, auto const& rhs_child) { + return shallow_equal_impl{}(lhs_child, rhs_child, is_empty); + }); + } +}; +bool is_shallow_equal(column_view const& lhs, column_view const& rhs) +{ + return shallow_equal_impl{}(lhs, rhs); +} } // namespace detail // Immutable view constructor diff --git 
a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 58fa28397a8..6858911c54f 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -272,31 +272,46 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash_mutable) } } -TYPED_TEST(ColumnViewShallowTests, shallow_equal) +TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_basic) { using namespace cudf::detail; auto col = example_column(); auto col_view = cudf::column_view{*col}; // same = same hash { - EXPECT_TRUE(shallow_equal(col_view, col_view)); + EXPECT_TRUE(is_shallow_equal(col_view, col_view)); } // copy column_view = same hash { auto col_view_copy = col_view; - EXPECT_TRUE(shallow_equal(col_view, col_view_copy)); + EXPECT_TRUE(is_shallow_equal(col_view, col_view_copy)); } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); + } + // copy column = diff hash { auto col_new = std::make_unique(*col); auto col_view_copy = col_new->view(); - EXPECT_FALSE(shallow_equal(col_view, col_view_copy)); + EXPECT_FALSE(is_shallow_equal(col_view, col_view_copy)); } - // new column_view from column = same hash + + // column_view, diff column = diff hash. { - auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_FALSE(is_shallow_equal(col_view, col_view_diff)); } +} +TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; // update data + new column_view = same hash. 
{ // update data by modifying some bits: fixed_width, string, dict, list, struct @@ -310,57 +325,71 @@ TYPED_TEST(ColumnViewShallowTests, shallow_equal) cudf::set_null_mask(data, 2, 64, true); } auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); } // add null_mask + new column_view = diff hash. { col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); auto col_view_new = cudf::column_view{*col}; - EXPECT_FALSE(shallow_equal(col_view, col_view_new)); + EXPECT_FALSE(is_shallow_equal(col_view, col_view_new)); col_view_new.null_count(); - EXPECT_FALSE(shallow_equal(col_view, col_view_new)); + EXPECT_FALSE(is_shallow_equal(col_view, col_view_new)); auto col_view_new2 = cudf::column_view{*col}; - EXPECT_TRUE(shallow_equal(col_view_new, col_view_new2)); + EXPECT_TRUE(is_shallow_equal(col_view_new, col_view_new2)); } col_view = cudf::column_view{*col}; // updating after adding null_mask // update nulls + new column_view = same hash. { cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); } // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) { col->set_null_count(cudf::UNKNOWN_NULL_COUNT); auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(shallow_equal(col_view, col_view_new)); + EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); col->set_null_count(col->size()); auto col_view_new2 = cudf::column_view{*col}; - EXPECT_TRUE(shallow_equal(col_view, col_view_new2)); + EXPECT_TRUE(is_shallow_equal(col_view, col_view_new2)); } +} - // column_view, diff column = diff hash. 
- { - auto col_diff = example_column(); - auto col_view_diff = cudf::column_view{*col_diff}; - EXPECT_FALSE(shallow_equal(col_view, col_view_diff)); - } - // column_view, sliced[0, size] = same hash (for split too) +TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) { auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); - EXPECT_TRUE(shallow_equal(col_view, col_sliced[0])); + EXPECT_TRUE(is_shallow_equal(col_view, col_sliced[0])); auto col_split = cudf::split(col_view, {0}); - EXPECT_FALSE(shallow_equal(col_view, col_split[0])); - EXPECT_TRUE(shallow_equal(col_view, col_split[1])); + EXPECT_FALSE(is_shallow_equal(col_view, col_split[0])); + EXPECT_TRUE(is_shallow_equal(col_view, col_split[1])); } // column_view, sliced[n:] = diff hash (for split too) { auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); - EXPECT_FALSE(shallow_equal(col_view, col_sliced[0])); + EXPECT_FALSE(is_shallow_equal(col_view, col_sliced[0])); auto col_split = cudf::split(col_view, {1}); - EXPECT_FALSE(shallow_equal(col_view, col_split[0])); - EXPECT_FALSE(shallow_equal(col_view, col_split[1])); + EXPECT_FALSE(is_shallow_equal(col_view, col_split[0])); + EXPECT_FALSE(is_shallow_equal(col_view, col_split[1])); } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + EXPECT_TRUE(is_shallow_equal(col_sliced[0], col_sliced[1])); + EXPECT_TRUE(is_shallow_equal(col_sliced[1], col_sliced[2])); + EXPECT_TRUE(is_shallow_equal(col_sliced[0], col_new_sliced[0])); + EXPECT_TRUE(is_shallow_equal(col_sliced[1], 
col_new_sliced[1])); + EXPECT_TRUE(is_shallow_equal(col_sliced[2], col_new_sliced[2])); + } + // column_view, bit_cast = diff hash { if constexpr (std::is_integral_v and not std::is_same_v) { @@ -369,13 +398,20 @@ TYPED_TEST(ColumnViewShallowTests, shallow_equal) std::make_signed_t>; auto new_type = cudf::data_type(cudf::type_to_id()); auto col_bitcast = cudf::bit_cast(col_view, new_type); - EXPECT_FALSE(shallow_equal(col_view, col_bitcast)); + EXPECT_FALSE(is_shallow_equal(col_view, col_bitcast)); } } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; // mutable_column_view, column_view = same hash { auto col_mutable = cudf::mutable_column_view{*col}; - EXPECT_TRUE(shallow_equal(col_mutable, col_view)); + EXPECT_TRUE(is_shallow_equal(col_mutable, col_view)); } // mutable_column_view, modified mutable_column_view = same hash // update the children column data = same hash @@ -390,9 +426,9 @@ TYPED_TEST(ColumnViewShallowTests, shallow_equal) auto data = reinterpret_cast(col->child(0).mutable_view().head()); cudf::set_null_mask(data, 1, 32, false); } - EXPECT_TRUE(shallow_equal(col_view, col_mutable)); + EXPECT_TRUE(is_shallow_equal(col_view, col_mutable)); auto col_mutable_new = cudf::mutable_column_view{*col}; - EXPECT_TRUE(shallow_equal(col_mutable, col_mutable_new)); + EXPECT_TRUE(is_shallow_equal(col_mutable, col_mutable_new)); } // update the children column_views = diff hash { @@ -400,7 +436,7 @@ TYPED_TEST(ColumnViewShallowTests, shallow_equal) col->child(0).set_null_mask( cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); auto col_child_updated = cudf::mutable_column_view{*col}; - EXPECT_FALSE(shallow_equal(col_view, col_child_updated)); + EXPECT_FALSE(is_shallow_equal(col_view, col_child_updated)); } } } From 82b5a26b6b6ca537c0eee35168721a3cd2747464 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 14 Sep 2021 
14:46:30 -0700 Subject: [PATCH 19/79] set STRUCT_AGGS to CORRELATION --- python/cudf/cudf/_lib/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 153b116cd33..a6ea631de82 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = set() +_STRUCT_AGGS = {'CORRELATION'} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} From e692053241c68893d20f97e6abaca4639747dfb2 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 15 Sep 2021 04:40:38 +0530 Subject: [PATCH 20/79] for empty column, ignore child pointers in shallow_hash --- cpp/src/column/column_view.cpp | 54 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 4447273d7ff..b0363000213 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -80,28 +80,44 @@ size_type column_view_base::null_count(size_type begin, size_type end) const // simple prime number multiplication algorithm. // Adapted from http://myeyesareblind.com/2017/02/06/Combine-hash-values/#apachecommons -constexpr void combine_hash(std::size_t& h1, std::size_t h2) { h1 = h1 * 127 + h2; } +constexpr std::size_t combine_hash(std::size_t h1, std::size_t h2) { return h1 * 127 + h2; } +// 32/64-bit boost hash_combine https://stackoverflow.com/a/4948967/1550940 +constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) +{ + constexpr std::size_t const magic = sizeof(std::size_t) == 8 ? 
0x9e3779b97f4a7c15 : 0x9e3779b9; + lhs ^= rhs + magic + (lhs << 6) + (lhs >> 2); + return lhs; +} + +// Struct to use custom combine hash and fold expression +struct HashValue { + std::size_t hash; + HashValue(std::size_t h) : hash{h} {} + HashValue operator^(HashValue const& other) const + { + return HashValue{combine_hash(hash, other.hash)}; + } +}; + +template +constexpr auto hash(Ts&&... ts) +{ + return (... ^ HashValue(std::hash{}(ts))).hash; +} struct shallow_hash_impl { - std::size_t operator()(column_view const& input, bool is_parent_empty = false) + std::size_t operator()(column_view const& c, bool is_parent_empty = false) { - std::size_t hash = 0; - combine_hash(hash, std::hash{}(input.type())); - combine_hash(hash, std::hash{}(input.size())); - if (not(input.is_empty() or is_parent_empty)) { - combine_hash(hash, std::hash{}(input.head())); - combine_hash(hash, std::hash{}(input.null_mask())); - combine_hash(hash, std::hash{}(input.offset())); - } - hash = std::accumulate( - input.child_begin(), - input.child_end(), - hash, - [&input, is_parent_empty](std::size_t hash, auto const& child) { - combine_hash(hash, shallow_hash_impl{}(child, input.is_empty() or is_parent_empty)); - return hash; - }); - return hash; + std::size_t const init = (c.is_empty() or is_parent_empty) + ? 
hash(c.type(), c.size()) + : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); + return std::accumulate(c.child_begin(), + c.child_end(), + init, + [&c, is_parent_empty](std::size_t hash, auto const& child) { + return combine_hash( + hash, shallow_hash_impl{}(child, c.is_empty() or is_parent_empty)); + }); } }; From 44372bcb35d27e4fff3c1fef58e8d2b4fed10feb Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 15 Sep 2021 04:43:11 +0530 Subject: [PATCH 21/79] rename is_shallow_equal to is_shallow_equivalent --- cpp/include/cudf/column/column_view.hpp | 4 +- cpp/src/column/column_view.cpp | 2 +- cpp/tests/column/column_view_shallow_test.cpp | 64 +++++++++---------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 25d13d04207..546f91a30a3 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -638,7 +638,7 @@ namespace detail { * @brief Computes a hash value from the shallow state of the specified column * * Two `column_view`s, `c1` and `c2`, that view the exact same physical column will produce equal - * `shallow_hash()` values, i.e., `is_shallow_equal(c0, c1)` implies `shallow_hash(c0) == + * `shallow_hash()` values, i.e., `is_shallow_equivalent(c0, c1)` implies `shallow_hash(c0) == * shallow_hash(c1)`. 
* * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., @@ -667,6 +667,6 @@ std::size_t shallow_hash(column_view const& input); * @param rhs The right `column_view` to compare * @return true if the shallow states of the two column views are equal */ -bool is_shallow_equal(column_view const& lhs, column_view const& rhs); +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs); } // namespace detail } // namespace cudf diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index b0363000213..2464a9eeee6 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -140,7 +140,7 @@ struct shallow_equal_impl { }); } }; -bool is_shallow_equal(column_view const& lhs, column_view const& rhs) +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) { return shallow_equal_impl{}(lhs, rhs); } diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 6858911c54f..f76f682bb2f 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -272,42 +272,42 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash_mutable) } } -TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_basic) +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_basic) { using namespace cudf::detail; auto col = example_column(); auto col_view = cudf::column_view{*col}; // same = same hash { - EXPECT_TRUE(is_shallow_equal(col_view, col_view)); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view)); } // copy column_view = same hash { auto col_view_copy = col_view; - EXPECT_TRUE(is_shallow_equal(col_view, col_view_copy)); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_copy)); } // new column_view from column = same hash { auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); + EXPECT_TRUE(is_shallow_equivalent(col_view, 
col_view_new)); } // copy column = diff hash { auto col_new = std::make_unique(*col); auto col_view_copy = col_new->view(); - EXPECT_FALSE(is_shallow_equal(col_view, col_view_copy)); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_copy)); } // column_view, diff column = diff hash. { auto col_diff = example_column(); auto col_view_diff = cudf::column_view{*col_diff}; - EXPECT_FALSE(is_shallow_equal(col_view, col_view_diff)); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_diff)); } } -TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_update_data) +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_update_data) { using namespace cudf::detail; auto col = example_column(); @@ -325,37 +325,37 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_update_data) cudf::set_null_mask(data, 2, 64, true); } auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); } // add null_mask + new column_view = diff hash. { col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); auto col_view_new = cudf::column_view{*col}; - EXPECT_FALSE(is_shallow_equal(col_view, col_view_new)); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); col_view_new.null_count(); - EXPECT_FALSE(is_shallow_equal(col_view, col_view_new)); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); auto col_view_new2 = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_view_new, col_view_new2)); + EXPECT_TRUE(is_shallow_equivalent(col_view_new, col_view_new2)); } col_view = cudf::column_view{*col}; // updating after adding null_mask // update nulls + new column_view = same hash. 
{ cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); } // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) { col->set_null_count(cudf::UNKNOWN_NULL_COUNT); auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_view, col_view_new)); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); col->set_null_count(col->size()); auto col_view_new2 = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_view, col_view_new2)); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new2)); } } -TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_slice) +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_slice) { using namespace cudf::detail; auto col = example_column(); @@ -363,18 +363,18 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_slice) // column_view, sliced[0, size) = same hash (for split too) { auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); - EXPECT_TRUE(is_shallow_equal(col_view, col_sliced[0])); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_sliced[0])); auto col_split = cudf::split(col_view, {0}); - EXPECT_FALSE(is_shallow_equal(col_view, col_split[0])); - EXPECT_TRUE(is_shallow_equal(col_view, col_split[1])); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_split[1])); } // column_view, sliced[n:] = diff hash (for split too) { auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); - EXPECT_FALSE(is_shallow_equal(col_view, col_sliced[0])); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_sliced[0])); auto col_split = cudf::split(col_view, {1}); - EXPECT_FALSE(is_shallow_equal(col_view, col_split[0])); - EXPECT_FALSE(is_shallow_equal(col_view, col_split[1])); + EXPECT_FALSE(is_shallow_equivalent(col_view, 
col_split[0])); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[1])); } // column_view, col copy sliced[0, 0) = same hash (empty column) { @@ -383,11 +383,11 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_slice) auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); - EXPECT_TRUE(is_shallow_equal(col_sliced[0], col_sliced[1])); - EXPECT_TRUE(is_shallow_equal(col_sliced[1], col_sliced[2])); - EXPECT_TRUE(is_shallow_equal(col_sliced[0], col_new_sliced[0])); - EXPECT_TRUE(is_shallow_equal(col_sliced[1], col_new_sliced[1])); - EXPECT_TRUE(is_shallow_equal(col_sliced[2], col_new_sliced[2])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_sliced[2])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_new_sliced[0])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_new_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[2], col_new_sliced[2])); } // column_view, bit_cast = diff hash @@ -398,12 +398,12 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_slice) std::make_signed_t>; auto new_type = cudf::data_type(cudf::type_to_id()); auto col_bitcast = cudf::bit_cast(col_view, new_type); - EXPECT_FALSE(is_shallow_equal(col_view, col_bitcast)); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_bitcast)); } } } -TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_mutable) +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_mutable) { using namespace cudf::detail; auto col = example_column(); @@ -411,7 +411,7 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_mutable) // mutable_column_view, column_view = same hash { auto col_mutable = cudf::mutable_column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_mutable, col_view)); + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_view)); } // mutable_column_view, modified 
mutable_column_view = same hash // update the children column data = same hash @@ -426,9 +426,9 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_mutable) auto data = reinterpret_cast(col->child(0).mutable_view().head()); cudf::set_null_mask(data, 1, 32, false); } - EXPECT_TRUE(is_shallow_equal(col_view, col_mutable)); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_mutable)); auto col_mutable_new = cudf::mutable_column_view{*col}; - EXPECT_TRUE(is_shallow_equal(col_mutable, col_mutable_new)); + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_mutable_new)); } // update the children column_views = diff hash { @@ -436,7 +436,7 @@ TYPED_TEST(ColumnViewShallowTests, is_shallow_equal_mutable) col->child(0).set_null_mask( cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); auto col_child_updated = cudf::mutable_column_view{*col}; - EXPECT_FALSE(is_shallow_equal(col_view, col_child_updated)); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_child_updated)); } } } From 3aab04fd20a78ba543265e572d4c9507ee27b59e Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 15 Sep 2021 16:08:52 -0700 Subject: [PATCH 22/79] added ctypedef correlation_type. need to add tests --- python/cudf/cudf/_lib/aggregation.pyx | 10 +++++++--- python/cudf/cudf/_lib/cpp/aggregation.pxd | 8 +++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 3557b505e81..0c594d302f8 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -57,7 +57,7 @@ class AggregationKind(Enum): UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA - + CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION cdef class Aggregation: """A Cython wrapper for aggregations. 
@@ -325,7 +325,9 @@ cdef class Aggregation: def correlation(cls): cdef Aggregation agg = cls() agg.c_obj = move( - libcudf_aggregation.make_correlation_aggregation[aggregation]()) + libcudf_aggregation.make_correlation_aggregation[aggregation]( + libcudf_aggregation.correlation_type.PEARSON + )) return agg cdef class RollingAggregation: @@ -704,7 +706,9 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. - make_correlation_aggregation[groupby_aggregation]()) + make_correlation_aggregation[groupby_aggregation]( + libcudf_aggregation.correlation_type.PEARSON + )) return agg cdef class GroupbyScanAggregation: diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index db4c5f023a6..04deeb877d1 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -55,6 +55,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' + ctypedef enum correlation_type: + PEARSON 'cudf::correlation_type::PEARSON' + KENDALL 'cudf::correlation_type::KENDALL' + SPEARMAN 'cudf::correlation_type::SPEARMAN' + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -109,4 +114,5 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: string user_defined_aggregator, data_type output_type) except + - cdef unique_ptr[T] make_correlation_aggregation[T]() except + + cdef unique_ptr[T] make_correlation_aggregation[T]( + correlation_type type) except + From ecc3a7d436c5b054674d3f81054db61a5c0fbbbf Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 16 Sep 2021 18:04:41 +0530 Subject: [PATCH 23/79] use hash_combine for shallow hash --- cpp/include/cudf/types.hpp | 21 ++++++++++++++++++--- cpp/src/column/column_view.cpp | 18 ++++++------------ 2 files changed, 24 insertions(+), 15 deletions(-) diff --git 
a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index b417148b1a4..2afc220162c 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include /** * @file @@ -328,6 +328,21 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh */ std::size_t size_of(data_type t); +/** + * @brief Combines two hashed values into a single hashed value. + * + * Adapted from boost hash_combine, modified for 32/64-bit + * https://stackoverflow.com/a/4948967/1550940 + * @param lhs The first hashed value + * @param rhs The second hashed value + * @return Combined hash value + */ +constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) +{ + constexpr std::size_t const magic = sizeof(std::size_t) == 8 ? 0x9e3779b97f4a7c15 : 0x9e3779b9; + lhs ^= rhs + magic + (lhs << 6) + (lhs >> 2); + return lhs; +} /** * @brief Identifies the hash function to be used */ @@ -353,8 +368,8 @@ template <> struct hash { std::size_t operator()(cudf::data_type const& type) const noexcept { - return std::hash{}(static_cast(type.id())) * 127 + - std::hash{}(type.scale()); + return cudf::hash_combine(std::hash{}(static_cast(type.id())), + std::hash{}(type.scale())); } }; } // namespace std diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 2464a9eeee6..67aad27f951 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -78,24 +78,18 @@ size_type column_view_base::null_count(size_type begin, size_type end) const : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); } +// Alternative fast hash functions // simple prime number multiplication algorithm. 
// Adapted from http://myeyesareblind.com/2017/02/06/Combine-hash-values/#apachecommons -constexpr std::size_t combine_hash(std::size_t h1, std::size_t h2) { return h1 * 127 + h2; } -// 32/64-bit boost hash_combine https://stackoverflow.com/a/4948967/1550940 -constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) -{ - constexpr std::size_t const magic = sizeof(std::size_t) == 8 ? 0x9e3779b97f4a7c15 : 0x9e3779b9; - lhs ^= rhs + magic + (lhs << 6) + (lhs >> 2); - return lhs; -} +// constexpr std::size_t combine_hash(std::size_t h1, std::size_t h2) { return h1 * 127 + h2; } -// Struct to use custom combine hash and fold expression +// Struct to use custom hash combine and fold expression struct HashValue { std::size_t hash; - HashValue(std::size_t h) : hash{h} {} + explicit HashValue(std::size_t h) : hash{h} {} HashValue operator^(HashValue const& other) const { - return HashValue{combine_hash(hash, other.hash)}; + return HashValue{hash_combine(hash, other.hash)}; } }; @@ -115,7 +109,7 @@ struct shallow_hash_impl { c.child_end(), init, [&c, is_parent_empty](std::size_t hash, auto const& child) { - return combine_hash( + return hash_combine( hash, shallow_hash_impl{}(child, c.is_empty() or is_parent_empty)); }); } From d2cd4681975f5d87b67ba8325c2af038d0b27a40 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 17 Sep 2021 04:08:02 +0530 Subject: [PATCH 24/79] Apply suggestions from code review (jake) Co-authored-by: Jake Hemstad --- cpp/include/cudf/column/column_view.hpp | 27 +++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 546f91a30a3..c6ba9fed1cc 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -637,9 +637,7 @@ namespace detail { /** * @brief Computes a hash value from the shallow state of the specified column * - * Two 
`column_view`s, `c1` and `c2`, that view the exact same physical column will produce equal - * `shallow_hash()` values, i.e., `is_shallow_equivalent(c0, c1)` implies `shallow_hash(c0) == - * shallow_hash(c1)`. + * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) == shallow_hash(c1)`. * * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., * it is independent of the number of elements in the column. @@ -648,24 +646,27 @@ namespace detail { * any kernels. * * @param input The `column_view` to compute hash - * @return The hash value + * @return The hash value derived from the shallow state of `input`. */ std::size_t shallow_hash(column_view const& input); /** - * @brief Equality operator for column views based on the shallow state of the column view. + * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns * - * Only shallow states used for the hash computation are: type, size, data pointer, null_mask - * pointer, offset and the column_view of the children recursively. Note that `null_count` is not - * used. - * - * Note: This equality function will consider a column not equal to a copy of the same column with - * exactly same contents. It is guarenteed to return true for same column_view only, even if the - * underlying data changes. + * Two columns are equivalent if for any operation `F` then: + * ``` + * is_shallow_equivalent(c0, c1) ==> is_shallow_equivalent(F(c0),F(c1)) + * ``` + * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact same physical column. In other words, two physically independent columns may have exactly equivalent elements but their shallow state would not be equivalent. + * + * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`, i.e., it is independent of the number of elements in either column. 
* + * This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor launch + * any kernels. + * * @param lhs The left `column_view` to compare * @param rhs The right `column_view` to compare - * @return true if the shallow states of the two column views are equal + * @return If `lhs` and `rhs` have equivalent shallow state */ bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs); } // namespace detail From fa40847cd36c79fdb1fc4799ebd300e78b520438 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 17 Sep 2021 10:40:42 +0530 Subject: [PATCH 25/79] address review comments --- cpp/include/cudf/column/column_view.hpp | 22 +++++---- cpp/include/cudf/types.hpp | 7 +-- cpp/src/column/column_view.cpp | 66 ++++++++++++------------- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index c6ba9fed1cc..f4ca4404430 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -637,7 +637,8 @@ namespace detail { /** * @brief Computes a hash value from the shallow state of the specified column * - * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) == shallow_hash(c1)`. + * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) == + * shallow_hash(c1)`. * * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., * it is independent of the number of elements in the column. @@ -646,24 +647,27 @@ namespace detail { * any kernels. * * @param input The `column_view` to compute hash - * @return The hash value derived from the shallow state of `input`. + * @return The hash value derived from the shallow state of `input`. 
*/ std::size_t shallow_hash(column_view const& input); /** - * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns + * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns * * Two columns are equivalent if for any operation `F` then: * ``` * is_shallow_equivalent(c0, c1) ==> is_shallow_equivalent(F(c0),F(c1)) * ``` - * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact same physical column. In other words, two physically independent columns may have exactly equivalent elements but their shallow state would not be equivalent. - * - * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`, i.e., it is independent of the number of elements in either column. + * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact + * same physical column. In other words, two physically independent columns may have exactly + * equivalent elements but their shallow state would not be equivalent. + * + * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`, + * i.e., it is independent of the number of elements in either column. + * + * This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor + * launch any kernels. * - * This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor launch - * any kernels. - * * @param lhs The left `column_view` to compare * @param rhs The right `column_view` to compare * @return If `lhs` and `rhs` have equivalent shallow state diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 2afc220162c..6926683b401 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -331,16 +331,17 @@ std::size_t size_of(data_type t); /** * @brief Combines two hashed values into a single hashed value. 
* - * Adapted from boost hash_combine, modified for 32/64-bit + * Adapted from Boost hash_combine function, modified for 64-bit + * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html * https://stackoverflow.com/a/4948967/1550940 + * * @param lhs The first hashed value * @param rhs The second hashed value * @return Combined hash value */ constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - constexpr std::size_t const magic = sizeof(std::size_t) == 8 ? 0x9e3779b97f4a7c15 : 0x9e3779b9; - lhs ^= rhs + magic + (lhs << 6) + (lhs >> 2); + lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); return lhs; } /** diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 67aad27f951..525da2afe73 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -99,44 +100,41 @@ constexpr auto hash(Ts&&... ts) return (... ^ HashValue(std::hash{}(ts))).hash; } -struct shallow_hash_impl { - std::size_t operator()(column_view const& c, bool is_parent_empty = false) - { - std::size_t const init = (c.is_empty() or is_parent_empty) - ? hash(c.type(), c.size()) - : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); - return std::accumulate(c.child_begin(), - c.child_end(), - init, - [&c, is_parent_empty](std::size_t hash, auto const& child) { - return hash_combine( - hash, shallow_hash_impl{}(child, c.is_empty() or is_parent_empty)); - }); - } -}; +std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false) +{ + std::size_t const init = (c.is_empty() or is_parent_empty) + ? 
hash(c.type(), c.size()) + : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); + return std::accumulate(c.child_begin(), + c.child_end(), + init, + [&c, is_parent_empty](std::size_t hash, auto const& child) { + return hash_combine( + hash, shallow_hash_impl(child, c.is_empty() or is_parent_empty)); + }); +} -std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl{}(input); } +std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } -struct shallow_equal_impl { - bool operator()(column_view const& lhs, column_view const& rhs, bool is_parent_empty = false) - { - bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; - return (lhs.type() == rhs.type()) and - (is_empty or - ((lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and - (lhs.null_mask() == rhs.null_mask()) and (lhs.offset() == rhs.offset()))) and - std::equal(lhs.child_begin(), - lhs.child_end(), - rhs.child_begin(), - rhs.child_end(), - [is_empty](auto const& lhs_child, auto const& rhs_child) { - return shallow_equal_impl{}(lhs_child, rhs_child, is_empty); - }); - } -}; +bool shallow_equal_impl(column_view const& lhs, + column_view const& rhs, + bool is_parent_empty = false) +{ + bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; + return (lhs.type() == rhs.type()) and + (is_empty or ((lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and + (lhs.null_mask() == rhs.null_mask()) and (lhs.offset() == rhs.offset()))) and + std::equal(lhs.child_begin(), + lhs.child_end(), + rhs.child_begin(), + rhs.child_end(), + [is_empty](auto const& lhs_child, auto const& rhs_child) { + return shallow_equal_impl(lhs_child, rhs_child, is_empty); + }); +} bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) { - return shallow_equal_impl{}(lhs, rhs); + return shallow_equal_impl(lhs, rhs); } } // namespace detail From 6ac572522d48545851f2869ee3ec7cd5094a5226 Mon Sep 17 00:00:00 
2001 From: Karthikeyan Natarajan Date: Fri, 17 Sep 2021 10:58:01 +0530 Subject: [PATCH 26/79] update after PR #9185 updates --- cpp/include/cudf/detail/aggregation/result_cache.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index a15e15d7d01..6ada327d107 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -27,14 +28,14 @@ struct pair_column_aggregation_equal_to { bool operator()(std::pair const& lhs, std::pair const& rhs) const { - return is_shallow_equal(lhs.first, rhs.first) and lhs.second.is_equal(rhs.second); + return is_shallow_equivalent(lhs.first, rhs.first) and lhs.second.is_equal(rhs.second); } }; struct pair_column_aggregation_hash { size_t operator()(std::pair const& key) const noexcept { - return shallow_hash(key.first) * 127 + key.second.do_hash(); + return hash_combine(shallow_hash(key.first), key.second.do_hash()); } }; From e36b834dd93c3184a7174c40b30020767d9bfa2f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 18 Sep 2021 13:50:59 +0530 Subject: [PATCH 27/79] add boost license for hash_combine, move to diff header --- cpp/include/cudf/detail/hashing.hpp | 35 +++++++++++++++++++ .../cudf/detail/utilities/hash_functions.cuh | 12 +++++++ cpp/include/cudf/types.hpp | 29 --------------- cpp/src/column/column_view.cpp | 6 +--- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 83d6be14709..22acf15fbf8 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -19,6 +19,8 @@ #include +#include + namespace cudf { namespace detail { @@ -53,5 +55,38 @@ std::unique_ptr serial_murmur_hash3_32( rmm::cuda_stream_view stream = rmm::cuda_stream_default, 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +/** + * @brief Combines two hashed values into a single hashed value. + * + * Adapted from Boost hash_combine function, modified for 64-bit + * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + * + * @param lhs The first hashed value + * @param rhs The second hashed value + * @return Combined hash value + */ +constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) +{ + lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); + return lhs; +} } // namespace detail } // namespace cudf + +// specialization of std::hash for cudf::data_type +namespace std { +template <> +struct hash { + std::size_t operator()(cudf::data_type const& type) const noexcept + { + return cudf::detail::hash_combine(std::hash{}(static_cast(type.id())), + std::hash{}(type.scale())); + } +}; +} // namespace std diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 6eab13ae9af..65deadd6cd0 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -395,6 +395,12 @@ struct MurmurHash3_32 { return h; } + /* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. 
@@ -795,6 +801,12 @@ struct IdentityHash { IdentityHash() = default; constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} + /* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 6926683b401..e1037efb5c8 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -27,7 +27,6 @@ #include #include #include -#include #include /** @@ -328,22 +327,6 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh */ std::size_t size_of(data_type t); -/** - * @brief Combines two hashed values into a single hashed value. - * - * Adapted from Boost hash_combine function, modified for 64-bit - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - * https://stackoverflow.com/a/4948967/1550940 - * - * @param lhs The first hashed value - * @param rhs The second hashed value - * @return Combined hash value - */ -constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) -{ - lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); - return lhs; -} /** * @brief Identifies the hash function to be used */ @@ -362,15 +345,3 @@ static constexpr uint32_t DEFAULT_HASH_SEED = 0; /** @} */ } // namespace cudf - -// specialization of std::hash for cudf::data_type -namespace std { -template <> -struct hash { - std::size_t operator()(cudf::data_type const& type) const noexcept - { - return cudf::hash_combine(std::hash{}(static_cast(type.id())), - std::hash{}(type.scale())); - } -}; -} // namespace std diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 525da2afe73..d214c507ca5 100644 --- 
a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -79,11 +80,6 @@ size_type column_view_base::null_count(size_type begin, size_type end) const : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); } -// Alternative fast hash functions -// simple prime number multiplication algorithm. -// Adapted from http://myeyesareblind.com/2017/02/06/Combine-hash-values/#apachecommons -// constexpr std::size_t combine_hash(std::size_t h1, std::size_t h2) { return h1 * 127 + h2; } - // Struct to use custom hash combine and fold expression struct HashValue { std::size_t hash; From 1fbe3fc688073062292c4825d1cd9ac116dad181 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Sat, 18 Sep 2021 13:53:21 +0530 Subject: [PATCH 28/79] Apply suggestions from code review (jake) Co-authored-by: Jake Hemstad --- cpp/include/cudf/column/column_view.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index f4ca4404430..cd490c3c832 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -656,7 +656,7 @@ std::size_t shallow_hash(column_view const& input); * * Two columns are equivalent if for any operation `F` then: * ``` - * is_shallow_equivalent(c0, c1) ==> is_shallow_equivalent(F(c0),F(c1)) + * is_shallow_equivalent(c0, c1) ==> The results of F(c0) and F(c1) are equivalent * ``` * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact * same physical column. 
In other words, two physically independent columns may have exactly From fc3cc6b538b60b7549de6ad08cceaa293098778b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 18 Sep 2021 13:59:26 +0530 Subject: [PATCH 29/79] include cleanup --- cpp/include/cudf/detail/hashing.hpp | 1 + cpp/src/column/column_view.cpp | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 22acf15fbf8..bd5c8a42a51 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -19,6 +19,7 @@ #include +#include #include namespace cudf { diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index d214c507ca5..d8132b4f545 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -24,7 +24,6 @@ #include #include -#include #include #include #include From f7b6bb637d9d3f4bd5c400610af40a06322cb511 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 18 Sep 2021 14:06:20 +0530 Subject: [PATCH 30/79] add missing include due to reorg --- cpp/include/cudf/detail/aggregation/result_cache.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index 6ada327d107..170960ba56d 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include From 7db9870d82392c88faa0a1942558bec46fbdfa19 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 18 Sep 2021 15:04:25 +0530 Subject: [PATCH 31/79] update groupby corr to use hashed result cache --- cpp/src/groupby/sort/aggregate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 3ce54c7996a..a54ef6e0a98 100644 --- 
a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -532,9 +532,9 @@ void aggregate_result_functor::operator()(aggregation con template <> void aggregate_result_functor::operator()(aggregation const& agg) { - if (cache.has_result(col_idx, agg)) { return; } + if (cache.has_result(values, agg)) { return; } - cache.add_result(col_idx, + cache.add_result(values, agg, detail::group_corr(get_grouped_values(), helper.group_offsets(stream), From 5bb1dc460d621db73cc6121b294029108c001840 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 18 Sep 2021 23:50:48 +0530 Subject: [PATCH 32/79] Revert "set STRUCT_AGGS to CORRELATION" This reverts commit 82b5a26b6b6ca537c0eee35168721a3cd2747464. --- python/cudf/cudf/_lib/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 3d9c39ae2fc..19ef6555a6e 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = {'CORRELATION'} +_STRUCT_AGGS = set() _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} From fb98fd551023bb314dbf1149433e3b5c5dfbad83 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 18 Sep 2021 23:50:59 +0530 Subject: [PATCH 33/79] Revert "added ctypedef correlation_type. need to add tests" This reverts commit 3aab04fd20a78ba543265e572d4c9507ee27b59e. 
--- python/cudf/cudf/_lib/aggregation.pyx | 10 +++------- python/cudf/cudf/_lib/cpp/aggregation.pxd | 8 +------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 0c594d302f8..3557b505e81 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -57,7 +57,7 @@ class AggregationKind(Enum): UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA - CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION + cdef class Aggregation: """A Cython wrapper for aggregations. @@ -325,9 +325,7 @@ cdef class Aggregation: def correlation(cls): cdef Aggregation agg = cls() agg.c_obj = move( - libcudf_aggregation.make_correlation_aggregation[aggregation]( - libcudf_aggregation.correlation_type.PEARSON - )) + libcudf_aggregation.make_correlation_aggregation[aggregation]()) return agg cdef class RollingAggregation: @@ -706,9 +704,7 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. 
- make_correlation_aggregation[groupby_aggregation]( - libcudf_aggregation.correlation_type.PEARSON - )) + make_correlation_aggregation[groupby_aggregation]()) return agg cdef class GroupbyScanAggregation: diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 04deeb877d1..db4c5f023a6 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -55,11 +55,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' - ctypedef enum correlation_type: - PEARSON 'cudf::correlation_type::PEARSON' - KENDALL 'cudf::correlation_type::KENDALL' - SPEARMAN 'cudf::correlation_type::SPEARMAN' - cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -114,5 +109,4 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: string user_defined_aggregator, data_type output_type) except + - cdef unique_ptr[T] make_correlation_aggregation[T]( - correlation_type type) except + + cdef unique_ptr[T] make_correlation_aggregation[T]() except + From 324c37dcfc0f2b3279ba075727fc8563af05205f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 18 Sep 2021 23:57:08 +0530 Subject: [PATCH 34/79] Revert "added definition of correlation() in cython" This reverts commit d96f870309d4c1cb0e7b93f4b32cfdfff543313c. 
--- Untitled.ipynb | 33 ----------------------- python/cudf/cudf/_lib/aggregation.pyx | 15 ----------- python/cudf/cudf/_lib/cpp/aggregation.pxd | 4 --- 3 files changed, 52 deletions(-) delete mode 100644 Untitled.ipynb diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index e38548d42a9..00000000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "06d9628c-d48e-40cb-a90b-ab83ce92af3b", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 3557b505e81..4f703724cef 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -321,13 +321,6 @@ cdef class Aggregation: )) return agg - @classmethod - def correlation(cls): - cdef Aggregation agg = cls() - agg.c_obj = move( - libcudf_aggregation.make_correlation_aggregation[aggregation]()) - return agg - cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -699,14 +692,6 @@ cdef class GroupbyAggregation: ) return agg - @classmethod - def correlation(cls): - cdef GroupbyAggregation agg = cls() - agg.c_obj = move( - libcudf_aggregation. - make_correlation_aggregation[groupby_aggregation]()) - return agg - cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. 
diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index db4c5f023a6..13bfa49057c 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -38,8 +38,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' - CORRELATION 'cudf::aggregation::CORRELATION' - Kind kind cdef cppclass rolling_aggregation: @@ -108,5 +106,3 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: udf_type type, string user_defined_aggregator, data_type output_type) except + - - cdef unique_ptr[T] make_correlation_aggregation[T]() except + From 9f19ddfb468fd345df0aef560dd19bf195884a27 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 20 Sep 2021 10:09:24 +0530 Subject: [PATCH 35/79] Apply suggestions from code review (jake) Co-authored-by: Jake Hemstad --- cpp/src/aggregation/result_cache.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/aggregation/result_cache.cpp b/cpp/src/aggregation/result_cache.cpp index 04750f7fa98..2b6359f20e8 100644 --- a/cpp/src/aggregation/result_cache.cpp +++ b/cpp/src/aggregation/result_cache.cpp @@ -38,18 +38,18 @@ void result_cache::add_result(column_view const& input, column_view result_cache::get_result(column_view const& input, aggregation const& agg) const { - CUDF_EXPECTS(has_result(input, agg), "Result does not exist in cache"); auto result_it = _cache.find({input, agg}); + CUDF_EXPECTS(result_it != _cache.end(), "Result does not exist in cache"); return result_it->second.second->view(); } std::unique_ptr result_cache::release_result(column_view const& input, aggregation const& agg) { - CUDF_EXPECTS(has_result(input, agg), "Result does not exist in cache"); - auto result_it = _cache.extract({input, agg}); + auto node = _cache.extract({input, agg}); + 
CUDF_EXPECTS(not node.empty(), "Result does not exist in cache"); return std::move(result_it.mapped().second); } From ab955bb2ffb99e7223f787f2091385d45a4068ec Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 20 Sep 2021 11:45:07 +0530 Subject: [PATCH 36/79] enable result caching of child columns in correlation --- cpp/src/groupby/sort/aggregate.cpp | 29 ++++++- cpp/src/groupby/sort/group_corr.cu | 92 ++++------------------- cpp/src/groupby/sort/group_reductions.hpp | 14 +++- 3 files changed, 56 insertions(+), 79 deletions(-) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index a54ef6e0a98..c7954db5d75 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -533,13 +533,40 @@ template <> void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(values, agg)) { return; } + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `group_corr` must be a structs column."); + CUDF_EXPECTS(values.num_children() == 2, + "Input to `group_corr` must be a structs column having 2 children columns."); + CUDF_EXPECTS(values.nullable() == false, + "Input to `group_corr` must be a non-nullable structs column."); + + auto const& corr_agg = dynamic_cast(agg); + CUDF_EXPECTS(corr_agg._type == correlation_type::PEARSON, + "Only Pearson correlation is supported."); + + auto std_agg = make_std_aggregation(); + cudf::detail::aggregation_dispatcher( + std_agg->kind, aggregate_result_functor(values.child(0), helper, cache, stream, mr), *std_agg); + cudf::detail::aggregation_dispatcher( + std_agg->kind, aggregate_result_functor(values.child(1), helper, cache, stream, mr), *std_agg); + + auto const stddev0 = cache.get_result(values.child(0), *std_agg); + auto const stddev1 = cache.get_result(values.child(1), *std_agg); + auto mean_agg = make_mean_aggregation(); + auto const mean0 = cache.get_result(values.child(0), *mean_agg); + auto const mean1 = 
cache.get_result(values.child(1), *mean_agg); cache.add_result(values, agg, - detail::group_corr(get_grouped_values(), + detail::group_corr(get_grouped_values().child(0), + get_grouped_values().child(1), helper.group_offsets(stream), helper.group_labels(stream), helper.num_groups(stream), + mean0, + mean1, + stddev0, + stddev1, stream, mr)); }; diff --git a/cpp/src/groupby/sort/group_corr.cu b/cpp/src/groupby/sort/group_corr.cu index 35f29a1bb59..5c47676d4f9 100644 --- a/cpp/src/groupby/sort/group_corr.cu +++ b/cpp/src/groupby/sort/group_corr.cu @@ -104,101 +104,42 @@ struct corr_transform { // : thrust::unary_function return (x - xmean) * (y - ymean) / (group_size - ddof) / xstddev / ystddev; } }; - -/* -sum((x-xu)*(y-yu)) -transform_output_iterator /N-1, stdx, stdy how do you know the indices? we can not. -So, -(x-xu)*(y-yu))/N-1/stdx/stdy as single iterator., then reduce_by_key. -very similar to var_transform in group_std. -*/ - -std::tuple, std::unique_ptr> group_mean_stddev( - column_view const& values_0, - cudf::device_span group_offsets, - cudf::device_span group_labels, - size_type num_groups, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto sum1 = detail::group_sum(values_0, num_groups, group_labels, stream, mr); - auto count1 = values_0.nullable() - ? 
detail::group_count_valid(values_0, group_labels, num_groups, stream, mr) - : detail::group_count_all(group_offsets, num_groups, stream, mr); - auto mean1 = - cudf::detail::binary_operation(*sum1, - *count1, - binary_operator::DIV, - cudf::detail::target_type(values_0.type(), aggregation::MEAN), - stream, - mr); - - auto var1 = detail::group_var(values_0, - *mean1, - *count1, - group_labels, - 1, // default var_agg._ddof, - stream, - mr); - auto stddev1 = cudf::detail::unary_operation(*var1, unary_operator::SQRT, stream, mr); - return std::make_tuple(std::move(mean1), std::move(stddev1)); -} - } // namespace // TODO Eventually this function should accept values_0, values_1, not a struct. -std::unique_ptr group_corr(column_view const& values, +std::unique_ptr group_corr(column_view const& values_0, + column_view const& values_1, cudf::device_span group_offsets, cudf::device_span group_labels, size_type num_groups, + column_view const& mean_0, + column_view const& mean_1, + column_view const& stddev_0, + column_view const& stddev_1, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(values.type().id() == type_id::STRUCT, - "Input to `group_corr` must be a structs column."); - CUDF_EXPECTS(values.num_children() == 2, - "Input to `group_corr` must be a structs column having 2 children columns."); - CUDF_EXPECTS(values.nullable() == false, - "Input to `group_corr` must be a non-nullable structs column."); - std::cout << "size=" << values.size() << std::endl; - std::cout << "num_children=" << values.num_children() << std::endl; - using result_type = id_to_type; static_assert( std::is_same_v, result_type>); // check if each child type can be converted to float64. 
- bool const is_convertible = - std::all_of(values.child_begin(), values.child_end(), [](auto const& c) { - return type_dispatcher(c.type(), is_double_convertible_impl{}); - }); + bool const is_convertible = type_dispatcher(values_0.type(), is_double_convertible_impl{}) or + type_dispatcher(values_1.type(), is_double_convertible_impl{}); + CUDF_EXPECTS(is_convertible, - "Input to `group_corr` must be a structs column having all children columns of type " - "convertible to float64."); + "Input to `group_corr` must be columns of type convertible to float64."); - // TODO calculate SUM // TODO calculate COUNT_VALID (need to do for 2 seperately. for MEAN, and // bitmask_and->COUNT_VALID for CORR.) - // TODO calculate MEAN - // TODO calculate VARIANCE - // TODO calculate STDDEV // TODO calculate CORR. (requires MEAN1, MEAN2, COUNT_VALID_ANDed, STDDEV1, STDDEV2) // TODO shuffle. - auto const& values_0 = values.child(0); - auto const& values_1 = values.child(1); - // TODO fix caching of child sum, count_valid, mean, variance, stddev. [unsupported due to - // result_cache design] - auto [mean0, stddev0] = - group_mean_stddev(values_0, group_offsets, group_labels, num_groups, stream, mr); - auto [mean1, stddev1] = - group_mean_stddev(values_1, group_offsets, group_labels, num_groups, stream, mr); - - auto mean0_ptr = mean0->mutable_view().begin(); - auto mean1_ptr = mean1->mutable_view().begin(); - auto stddev0_ptr = stddev0->mutable_view().begin(); - auto stddev1_ptr = stddev1->mutable_view().begin(); + auto mean0_ptr = mean_0.begin(); + auto mean1_ptr = mean_1.begin(); + auto stddev0_ptr = stddev_0.begin(); + auto stddev1_ptr = stddev_1.begin(); // TODO replace with ANDed bitmask. 
(values, stddev) auto count1 = values_0.nullable() @@ -217,9 +158,8 @@ std::unique_ptr group_corr(column_view const& values, group_labels.begin()}; // result - auto const any_nulls = std::any_of( - values.child_begin(), values.child_end(), [](auto const& c) { return c.has_nulls(); }); - auto mask_type = any_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED; + auto const any_nulls = values_0.has_nulls() or values_1.has_nulls(); + auto mask_type = any_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED; auto result = make_numeric_column(data_type(type_to_id()), num_groups, mask_type, stream, mr); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 6bb87d7ea6a..5bd658d8f76 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -445,17 +445,27 @@ std::unique_ptr group_merge_m2(column_view const& values, * @brief Internal API to find correlation of child columns of a non-nullable struct column. * TODO fill documentation. * - * @param values Grouped values (tuples of values `(valid_count, mean, M2)`) to merge. + * @param values_0 The first grouped values column to correlate + * @param values_1 The second grouped values column to correlate * @param group_offsets Offsets of groups' starting points within @p values. * @param group_labels ID of group that the corresponding value belongs to * @param num_groups Number of groups. + * @param mean_0 The mean of the first grouped values column + * @param mean_1 The mean of the second grouped values column + * @param stddev_0 The standard deviation of the first grouped values column + * @param stddev_1 The standard deviation of the second grouped values column * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr group_corr(column_view const& values, +std::unique_ptr group_corr(column_view const& values_0, + column_view const& values_1, cudf::device_span group_offsets, cudf::device_span group_labels, size_type num_groups, + column_view const& mean_0, + column_view const& mean_1, + column_view const& stddev_0, + column_view const& stddev_1, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @endinternal From 98bbc94fd4e5f572412475f7f487a7a4f42caac1 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 20 Sep 2021 13:33:46 +0530 Subject: [PATCH 37/79] fix duplicate {col, agg} request extract --- cpp/src/aggregation/result_cache.cpp | 4 +--- cpp/src/groupby/common/utils.hpp | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/cpp/src/aggregation/result_cache.cpp b/cpp/src/aggregation/result_cache.cpp index 2b6359f20e8..1889ae67ee3 100644 --- a/cpp/src/aggregation/result_cache.cpp +++ b/cpp/src/aggregation/result_cache.cpp @@ -38,7 +38,6 @@ void result_cache::add_result(column_view const& input, column_view result_cache::get_result(column_view const& input, aggregation const& agg) const { - auto result_it = _cache.find({input, agg}); CUDF_EXPECTS(result_it != _cache.end(), "Result does not exist in cache"); return result_it->second.second->view(); @@ -47,10 +46,9 @@ column_view result_cache::get_result(column_view const& input, aggregation const std::unique_ptr result_cache::release_result(column_view const& input, aggregation const& agg) { - auto node = _cache.extract({input, agg}); CUDF_EXPECTS(not node.empty(), "Result does not exist in cache"); - return std::move(result_it.mapped().second); + return std::move(node.mapped().second); } } // namespace detail diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index 129351c3d38..27a34843cc0 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -19,6 +19,8 @@ #include #include 
#include + +#include #include namespace cudf { @@ -30,10 +32,24 @@ inline std::vector extract_results(host_span results(requests.size()); - + std::unordered_map>, + column_view, + cudf::detail::pair_column_aggregation_hash, + cudf::detail::pair_column_aggregation_equal_to> + repeated_result; for (size_t i = 0; i < requests.size(); i++) { for (auto&& agg : requests[i].aggregations) { - results[i].results.emplace_back(cache.release_result(requests[i].values, *agg)); + if (cache.has_result(requests[i].values, *agg)) { + results[i].results.emplace_back(cache.release_result(requests[i].values, *agg)); + repeated_result[{requests[i].values, *agg}] = results[i].results.back()->view(); + } else { + auto it = repeated_result.find({requests[i].values, *agg}); + if (it != repeated_result.end()) { + results[i].results.emplace_back(std::make_unique(it->second)); + } else { + CUDF_FAIL("Cannot extract result from the cache"); + } + } } } return results; From 95815250856ca02bdd5772e0b71b72547c2fccce Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 21 Sep 2021 04:38:13 +0530 Subject: [PATCH 38/79] address review comments --- cpp/src/column/column_view.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index d8132b4f545..25a1aa6f22b 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -98,7 +98,7 @@ constexpr auto hash(Ts&&... ts) std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false) { std::size_t const init = (c.is_empty() or is_parent_empty) - ? hash(c.type(), c.size()) + ? 
hash(c.type(), 0) : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); return std::accumulate(c.child_begin(), c.child_end(), @@ -111,9 +111,9 @@ std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } -bool shallow_equal_impl(column_view const& lhs, - column_view const& rhs, - bool is_parent_empty = false) +bool shallow_equivalent_impl(column_view const& lhs, + column_view const& rhs, + bool is_parent_empty = false) { bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; return (lhs.type() == rhs.type()) and @@ -124,12 +124,12 @@ bool shallow_equal_impl(column_view const& lhs, rhs.child_begin(), rhs.child_end(), [is_empty](auto const& lhs_child, auto const& rhs_child) { - return shallow_equal_impl(lhs_child, rhs_child, is_empty); + return shallow_equivalent_impl(lhs_child, rhs_child, is_empty); }); } bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) { - return shallow_equal_impl(lhs, rhs); + return shallow_equivalent_impl(lhs, rhs); } } // namespace detail From 1a5f367b7edd880fd1b2789d11a5531190339ee6 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 21 Sep 2021 21:29:57 +0530 Subject: [PATCH 39/79] Update cpp/src/column/column_view.cpp Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/column/column_view.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 25a1aa6f22b..5749cb48c0e 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -97,7 +97,7 @@ constexpr auto hash(Ts&&... ts) std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false) { - std::size_t const init = (c.is_empty() or is_parent_empty) + std::size_t const init = (is_parent_empty or c.is_empty()) ? 
hash(c.type(), 0) : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); return std::accumulate(c.child_begin(), From 63af02d65683a0b0bf34262911dcf37f8e6a2199 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 25 Sep 2021 01:37:50 +0530 Subject: [PATCH 40/79] add groupby correlation tests --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/correlation_tests.cpp | 163 ++++++++++++++++++++++++ cpp/tests/groupby/mean_tests.cpp | 60 --------- 3 files changed, 164 insertions(+), 60 deletions(-) create mode 100644 cpp/tests/groupby/correlation_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cde170fb598..20f7211c882 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -59,6 +59,7 @@ ConfigureTest(GROUPBY_TEST groupby/argmax_tests.cpp groupby/collect_list_tests.cpp groupby/collect_set_tests.cpp + groupby/correlation_tests.cpp groupby/count_scan_tests.cpp groupby/count_tests.cpp groupby/groups_tests.cpp diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp new file mode 100644 index 00000000000..db238ae5998 --- /dev/null +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cudf::test::iterators; +namespace cudf { +namespace test { + +using structs = structs_column_wrapper; + +template +struct groupby_correlation_test : public cudf::test::BaseFixture { +}; + +using supported_types = RemoveIf>, cudf::test::NumericTypes>; + +TYPED_TEST_CASE(groupby_correlation_test, supported_types); +using K = int32_t; + +TYPED_TEST(groupby_correlation_test, basic) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{ + {1.0, 0.6, std::numeric_limits::quiet_NaN()}}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, empty_cols) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper member_0{}, member_1{}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper expect_vals{}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, zero_valid_keys) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys({1, 2, 3}, all_nulls()); + fixed_width_column_wrapper member_0{3, 4, 5}, member_1{6, 7, 8}; + auto vals = 
structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper expect_vals{}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, zero_valid_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{1, 1, 1}; + fixed_width_column_wrapper member_0({3, 4, 5}, all_nulls()); + fixed_width_column_wrapper member_1({3, 4, 5}, all_nulls()); + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1}; + fixed_width_column_wrapper expect_vals({0}, all_nulls()); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, null_keys_and_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3, -1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3, -1, 0, 2, 2}); + auto vals = structs{{val0, val1}}; + + // { 1, 1, 2, 2, 2, 3, 3, 4} + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + // { 3, 6, 1, 4, 9, 2, 8, 3} + fixed_width_column_wrapper expect_vals( + {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0}); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +struct groupby_dictionary_correlation_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_dictionary_correlation_test, basic) +{ + using V = int16_t; + 
using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = dictionary_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = dictionary_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{ + {1.0, 0.6, std::numeric_limits::quiet_NaN()}}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 9cbeca8163f..613e1555b79 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -161,64 +160,5 @@ TEST_F(groupby_dictionary_mean_test, basic) keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); } -struct groupby_corr_test : public cudf::test::BaseFixture { -}; -template -using fwcw = fixed_width_column_wrapper; -using structs = structs_column_wrapper; - -TEST_F(groupby_corr_test, basic) -{ - using K = int32_t; - using M0 = uint8_t; - using M1 = int16_t; - using R = cudf::detail::target_type_t; - - // clang-format off - auto keys = fwcw { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2 }; - auto member_0 = fwcw{{ 1, 1, 1, 2, 2, 3, 3, 1, 1, 4 }};//, null_at(1)}; - auto member_1 = fwcw{{ 1, 1, 1, 2, -2, 3, 3, 1, 1, -4 }};//, null_at(7)}; - auto values = structs{{member_0, member_1}};//, null_at(4)}; - // clang-format on - - fixed_width_column_wrapper expect_keys({1, 2, 3}); - fixed_width_column_wrapper expect_vals{ - {1.000000, -0.41522739926869984, std::numeric_limits::quiet_NaN()}}; //, null_at(2)}; - // clang-format on - - auto agg = - 
cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); - std::vector requests; - requests.emplace_back(groupby::aggregation_request()); - requests[0].values = values; - - requests[0].aggregations.push_back(std::move(agg)); - requests.emplace_back(groupby::aggregation_request()); - // WAR to force groupby to use sort implementation - requests[0].aggregations.push_back(make_nth_element_aggregation(0)); - - requests[1].values = column_view(values).child(0); - requests[1].aggregations.push_back(cudf::make_mean_aggregation()); - requests[1].aggregations.push_back(cudf::make_std_aggregation()); - requests.emplace_back(groupby::aggregation_request()); - requests[2].values = column_view(values).child(1); - requests[2].aggregations.push_back(cudf::make_mean_aggregation()); - requests[2].aggregations.push_back(cudf::make_std_aggregation()); - - groupby::groupby gb_obj(table_view({keys})); - auto result = gb_obj.aggregate(requests); - - cudf::test::print(*result.second[0].results[0]); - cudf::test::print(*result.second[1].results[0]); - cudf::test::print(*result.second[1].results[1]); - cudf::test::print(*result.second[2].results[0]); - cudf::test::print(*result.second[2].results[1]); - - CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({expect_keys}), result.first->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( - expect_vals, *result.second[0].results[0], debug_output_level::ALL_ERRORS); - // test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); -} - } // namespace test } // namespace cudf From 14dd5bb1791e28659e2e0edad2c397ab3cb66a9e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 25 Sep 2021 01:39:43 +0530 Subject: [PATCH 41/79] enable dict for sort groupby mean --- cpp/src/groupby/sort/aggregate.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index c7954db5d75..03476b32151 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ 
b/cpp/src/groupby/sort/aggregate.cpp @@ -235,11 +235,14 @@ void aggregate_result_functor::operator()(aggregation const& // TODO (dm): Special case for timestamp. Add target_type_impl for it. // Blocked until we support operator+ on timestamps + auto col_type = cudf::is_dictionary(values.type()) + ? cudf::dictionary_column_view(values).keys().type() + : values.type(); auto result = cudf::detail::binary_operation(sum_result, count_result, binary_operator::DIV, - cudf::detail::target_type(values.type(), aggregation::MEAN), + cudf::detail::target_type(col_type, aggregation::MEAN), stream, mr); cache.add_result(values, agg, std::move(result)); From b0fea0202d87e6869b5b31675a647785c61e0f3e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 25 Sep 2021 01:40:22 +0530 Subject: [PATCH 42/79] update group_corr for null support --- cpp/src/groupby/sort/aggregate.cpp | 46 +++++++-- cpp/src/groupby/sort/group_corr.cu | 111 ++++++++-------------- cpp/src/groupby/sort/group_reductions.hpp | 4 +- 3 files changed, 80 insertions(+), 81 deletions(-) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 03476b32151..f0931cd3b61 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -547,25 +548,56 @@ void aggregate_result_functor::operator()(aggregation CUDF_EXPECTS(corr_agg._type == correlation_type::PEARSON, "Only Pearson correlation is supported."); + // Correlation only for valid values in both columns. 
+ auto [_, values_child0, values_child1] = [this]() { + rmm::device_buffer new_nullmask = + cudf::bitmask_and(table_view{{values.child(0), values.child(1)}}); + auto null_count = cudf::count_unset_bits( + static_cast(new_nullmask.data()), 0, values.size()); + if (null_count == 0) { + return std::make_tuple(std::move(new_nullmask), values.child(0), values.child(1)); + } + auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { + return column_view(col.type(), + col.size(), + col.head(), + static_cast(nullmask), + null_count, + col.offset(), + std::vector(col.child_begin(), col.child_end())); + }; + auto values_child0 = + null_count == values.child(0).null_count() + ? values.child(0) + : column_view_with_new_nullmask(values.child(0), new_nullmask.data(), null_count); + auto values_child1 = + null_count == values.child(1).null_count() + ? values.child(1) + : column_view_with_new_nullmask(values.child(1), new_nullmask.data(), null_count); + return std::make_tuple(std::move(new_nullmask), values_child0, values_child1); + }(); + auto std_agg = make_std_aggregation(); cudf::detail::aggregation_dispatcher( - std_agg->kind, aggregate_result_functor(values.child(0), helper, cache, stream, mr), *std_agg); + std_agg->kind, aggregate_result_functor(values_child0, helper, cache, stream, mr), *std_agg); cudf::detail::aggregation_dispatcher( - std_agg->kind, aggregate_result_functor(values.child(1), helper, cache, stream, mr), *std_agg); + std_agg->kind, aggregate_result_functor(values_child1, helper, cache, stream, mr), *std_agg); - auto const stddev0 = cache.get_result(values.child(0), *std_agg); - auto const stddev1 = cache.get_result(values.child(1), *std_agg); + auto const stddev0 = cache.get_result(values_child0, *std_agg); + auto const stddev1 = cache.get_result(values_child1, *std_agg); auto mean_agg = make_mean_aggregation(); - auto const mean0 = cache.get_result(values.child(0), *mean_agg); - auto const mean1 = 
cache.get_result(values.child(1), *mean_agg); + auto const mean0 = cache.get_result(values_child0, *mean_agg); + auto const mean1 = cache.get_result(values_child1, *mean_agg); + auto count_agg = make_count_aggregation(); + auto const count = cache.get_result(values_child0, *count_agg); cache.add_result(values, agg, detail::group_corr(get_grouped_values().child(0), get_grouped_values().child(1), - helper.group_offsets(stream), helper.group_labels(stream), helper.num_groups(stream), + count, mean0, mean1, stddev0, diff --git a/cpp/src/groupby/sort/group_corr.cu b/cpp/src/groupby/sort/group_corr.cu index 5c47676d4f9..4e3e916fdfb 100644 --- a/cpp/src/groupby/sort/group_corr.cu +++ b/cpp/src/groupby/sort/group_corr.cu @@ -14,31 +14,25 @@ * limitations under the License. */ +#include + #include #include #include -#include -#include -#include #include -#include -#include +#include #include #include -#include #include #include -#include -#include "cudf/types.hpp" -#include "groupby/sort/group_reductions.hpp" -#include "thrust/functional.h" -#include "thrust/iterator/counting_iterator.h" -#include "thrust/iterator/zip_iterator.h" +#include #include #include +#include + namespace cudf { namespace groupby { namespace detail { @@ -75,7 +69,7 @@ struct type_casted_accessor { }; template -struct corr_transform { // : thrust::unary_function +struct corr_transform { column_device_view const d_values_0, d_values_1; ResultType const *d_means_0, *d_means_1; ResultType const *d_stddev_0, *d_stddev_1; @@ -83,13 +77,21 @@ struct corr_transform { // : thrust::unary_function size_type const* d_group_labels; size_type ddof{1}; // TODO update based on bias. + __device__ static ResultType value(column_device_view const& view, size_type i) + { + bool const is_dict = view.type().id() == type_id::DICTIONARY32; + i = is_dict ? static_cast(view.element(i)) : i; + auto values_col = is_dict ? 
view.child(dictionary_column_view::keys_column_index) : view; + return type_dispatcher(values_col.type(), type_casted_accessor{}, i, values_col); + } + __device__ ResultType operator()(size_type i) { if (d_values_0.is_null(i) or d_values_1.is_null(i)) return 0.0; // This has to be device dispatch because x and y type may differ - auto x = type_dispatcher(d_values_0.type(), type_casted_accessor{}, i, d_values_0); - auto y = type_dispatcher(d_values_1.type(), type_casted_accessor{}, i, d_values_1); + auto x = value(d_values_0, i); + auto y = value(d_values_1, i); size_type group_idx = d_group_labels[i]; size_type group_size = d_group_sizes[group_idx]; @@ -109,9 +111,9 @@ struct corr_transform { // : thrust::unary_function // TODO Eventually this function should accept values_0, values_1, not a struct. std::unique_ptr group_corr(column_view const& values_0, column_view const& values_1, - cudf::device_span group_offsets, cudf::device_span group_labels, size_type num_groups, + column_view const& count, column_view const& mean_0, column_view const& mean_1, column_view const& stddev_0, @@ -125,27 +127,24 @@ std::unique_ptr group_corr(column_view const& values_0, result_type>); // check if each child type can be converted to float64. - bool const is_convertible = type_dispatcher(values_0.type(), is_double_convertible_impl{}) or - type_dispatcher(values_1.type(), is_double_convertible_impl{}); + auto get_base_type = [](auto const& col) { + return (col.type().id() == type_id::DICTIONARY32 + ? col.child(dictionary_column_view::keys_column_index) + : col) + .type(); + }; + bool const is_convertible = + type_dispatcher(get_base_type(values_0), is_double_convertible_impl{}) or + type_dispatcher(get_base_type(values_1), is_double_convertible_impl{}); CUDF_EXPECTS(is_convertible, "Input to `group_corr` must be columns of type convertible to float64."); - // TODO calculate COUNT_VALID (need to do for 2 seperately. for MEAN, and - // bitmask_and->COUNT_VALID for CORR.) 
- // TODO calculate CORR. (requires MEAN1, MEAN2, COUNT_VALID_ANDed, STDDEV1, STDDEV2) - // TODO shuffle. - auto mean0_ptr = mean_0.begin(); auto mean1_ptr = mean_1.begin(); auto stddev0_ptr = stddev_0.begin(); auto stddev1_ptr = stddev_1.begin(); - // TODO replace with ANDed bitmask. (values, stddev) - auto count1 = values_0.nullable() - ? detail::group_count_valid(values_0, group_labels, num_groups, stream, mr) - : detail::group_count_all(group_offsets, num_groups, stream, mr); - auto d_values_0 = column_device_view::create(values_0, stream); auto d_values_1 = column_device_view::create(values_1, stream); corr_transform corr_transform_op{*d_values_0, @@ -154,15 +153,11 @@ std::unique_ptr group_corr(column_view const& values_0, mean1_ptr, stddev0_ptr, stddev1_ptr, - count1->view().data(), + count.data(), group_labels.begin()}; - // result - auto const any_nulls = values_0.has_nulls() or values_1.has_nulls(); - auto mask_type = any_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED; - - auto result = - make_numeric_column(data_type(type_to_id()), num_groups, mask_type, stream, mr); + auto result = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); auto d_result = result->mutable_view().begin(); auto corr_iter = @@ -174,44 +169,16 @@ std::unique_ptr group_corr(column_view const& values_0, corr_iter, thrust::make_discard_iterator(), d_result); - return result; - - // auto result_M2s = make_numeric_column( - // data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); - // auto validities = rmm::device_uvector(num_groups, stream); - - // // Perform merging for all the aggregations. Their output (and their validity data) are written - // // out concurrently through an output zip iterator. 
- // using iterator_tuple = thrust::tuple; - // using output_iterator = thrust::zip_iterator; - // auto const out_iter = - // output_iterator{thrust::make_tuple(result_counts->mutable_view().template data(), - // result_means->mutable_view().template data(), - // result_M2s->mutable_view().template data(), - // validities.begin())}; - - // auto const count_valid = values.child(0); - // auto const mean_values = values.child(1); - // auto const M2_values = values.child(2); - // auto const iter = thrust::make_counting_iterator(0); - - // auto const fn = merge_fn{group_offsets.begin(), - // count_valid.template begin(), - // mean_values.template begin(), - // M2_values.template begin()}; - // thrust::transform(rmm::exec_policy(stream), iter, iter + num_groups, out_iter, fn); - - // // Generate bitmask for the output. - // // Only mean and M2 values can be nullable. Count column must be non-nullable. - // auto [null_mask, null_count] = cudf::detail::valid_if( - // validities.begin(), validities.end(), thrust::identity{}, stream, mr); - // if (null_count > 0) { - // result_means->set_null_mask(null_mask, null_count); // copy null_mask - // result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask - // } - - // Output is a structs column containing the merged values of `COUNT_VALID`, `MEAN`, and `M2`. 
+ auto is_null = [ddof = corr_transform_op.ddof] __device__(size_type group_size) { + return not(group_size == 0 or group_size - ddof <= 0); + }; + auto [new_nullmask, null_count] = + cudf::detail::valid_if(count.begin(), count.end(), is_null, stream, mr); + if (null_count != 0) { + result->set_null_mask(std::move(new_nullmask)); + result->set_null_count(null_count); + } return result; } diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 5bd658d8f76..7133da1a7e7 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -447,9 +447,9 @@ std::unique_ptr group_merge_m2(column_view const& values, * * @param values_0 The first grouped values column to correlate * @param values_1 The second grouped values column to correlate - * @param group_offsets Offsets of groups' starting points within @p values. * @param group_labels ID of group that the corresponding value belongs to * @param num_groups Number of groups. 
+ * @param count The count of valid rows of the grouped values of both columns * @param mean_0 The mean of the first grouped values column * @param mean_1 The mean of the second grouped values column * @param stddev_0 The standard deviation of the first grouped values column @@ -459,9 +459,9 @@ std::unique_ptr group_merge_m2(column_view const& values, */ std::unique_ptr group_corr(column_view const& values_0, column_view const& values_1, - cudf::device_span group_offsets, cudf::device_span group_labels, size_type num_groups, + column_view const& count, column_view const& mean_0, column_view const& mean_1, column_view const& stddev_0, From 57db9014757564bd964acd00dde46d883bab928b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 25 Sep 2021 01:44:21 +0530 Subject: [PATCH 43/79] rename group_corr to group_correlation --- cpp/CMakeLists.txt | 2 +- cpp/src/groupby/sort/aggregate.cpp | 22 +++++++++--------- .../{group_corr.cu => group_correlation.cu} | 23 +++++++++---------- cpp/src/groupby/sort/group_reductions.hpp | 22 +++++++++--------- 4 files changed, 34 insertions(+), 35 deletions(-) rename cpp/src/groupby/sort/{group_corr.cu => group_correlation.cu} (88%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 959d858f2e6..ea29b6ab152 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -219,7 +219,7 @@ add_library(cudf src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu src/groupby/sort/group_collect.cu - src/groupby/sort/group_corr.cu + src/groupby/sort/group_correlation.cu src/groupby/sort/group_count.cu src/groupby/sort/group_m2.cu src/groupby/sort/group_max.cu diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index f0931cd3b61..25bca9c12b2 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -593,17 +593,17 @@ void aggregate_result_functor::operator()(aggregation cache.add_result(values, agg, - detail::group_corr(get_grouped_values().child(0), - 
get_grouped_values().child(1), - helper.group_labels(stream), - helper.num_groups(stream), - count, - mean0, - mean1, - stddev0, - stddev1, - stream, - mr)); + detail::group_correlation(get_grouped_values().child(0), + get_grouped_values().child(1), + helper.group_labels(stream), + helper.num_groups(stream), + count, + mean0, + mean1, + stddev0, + stddev1, + stream, + mr)); }; } // namespace detail diff --git a/cpp/src/groupby/sort/group_corr.cu b/cpp/src/groupby/sort/group_correlation.cu similarity index 88% rename from cpp/src/groupby/sort/group_corr.cu rename to cpp/src/groupby/sort/group_correlation.cu index 4e3e916fdfb..04a8c5909bb 100644 --- a/cpp/src/groupby/sort/group_corr.cu +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -108,18 +108,17 @@ struct corr_transform { }; } // namespace -// TODO Eventually this function should accept values_0, values_1, not a struct. -std::unique_ptr group_corr(column_view const& values_0, - column_view const& values_1, - cudf::device_span group_labels, - size_type num_groups, - column_view const& count, - column_view const& mean_0, - column_view const& mean_1, - column_view const& stddev_0, - column_view const& stddev_1, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr group_correlation(column_view const& values_0, + column_view const& values_1, + cudf::device_span group_labels, + size_type num_groups, + column_view const& count, + column_view const& mean_0, + column_view const& mean_1, + column_view const& stddev_0, + column_view const& stddev_1, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using result_type = id_to_type; static_assert( diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 7133da1a7e7..6e2ba2815c5 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -457,17 +457,17 @@ std::unique_ptr group_merge_m2(column_view const& values, * @param mr 
Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr group_corr(column_view const& values_0, - column_view const& values_1, - cudf::device_span group_labels, - size_type num_groups, - column_view const& count, - column_view const& mean_0, - column_view const& mean_1, - column_view const& stddev_0, - column_view const& stddev_1, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr group_correlation(column_view const& values_0, + column_view const& values_1, + cudf::device_span group_labels, + size_type num_groups, + column_view const& count, + column_view const& mean_0, + column_view const& mean_1, + column_view const& stddev_0, + column_view const& stddev_1, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** @endinternal * */ From 0d1a91e773d202c08c392827c69f8cace4dc4792 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 25 Sep 2021 02:31:06 +0530 Subject: [PATCH 44/79] update doc --- cpp/src/groupby/sort/group_reductions.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 6e2ba2815c5..a6c61b3a9fd 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -443,7 +443,6 @@ std::unique_ptr group_merge_m2(column_view const& values, rmm::mr::device_memory_resource* mr); /** * @brief Internal API to find correlation of child columns of a non-nullable struct column. - * TODO fill documentation. 
* * @param values_0 The first grouped values column to correlate * @param values_1 The second grouped values column to correlate @@ -454,8 +453,8 @@ std::unique_ptr group_merge_m2(column_view const& values, * @param mean_1 The mean of the second grouped values column * @param stddev_0 The standard deviation of the first grouped values column * @param stddev_1 The standard deviation of the second grouped values column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_correlation(column_view const& values_0, column_view const& values_1, From 6cd47bce5762cdffb66d47d76845a921b0890da4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 27 Sep 2021 12:44:52 +0530 Subject: [PATCH 45/79] minor comment corrections --- cpp/src/aggregation/aggregation.cpp | 2 +- cpp/src/groupby/sort/aggregate.cpp | 2 -- cpp/src/groupby/sort/group_correlation.cu | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 426fef279d9..26ab5936a74 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -699,7 +699,7 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); -/// Factory to create a CORR aggregation +/// Factory to create a CORRELATION aggregation template std::unique_ptr make_correlation_aggregation(correlation_type type) { diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 7ecf4f5855a..45227368097 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -580,8 +580,6 @@ void aggregate_result_functor::operator()(aggregation auto std_agg = 
make_std_aggregation(); aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*std_agg); aggregate_result_functor(values_child1, helper, cache, stream, mr).operator()(*std_agg); - // cudf::detail::aggregation_dispatcher( - // std_agg->kind, aggregate_result_functor(values_child1, helper, cache, stream, mr), *std_agg); auto const stddev0 = cache.get_result(values_child0, *std_agg); auto const stddev1 = cache.get_result(values_child1, *std_agg); diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu index 04a8c5909bb..70a2cbd9bb8 100644 --- a/cpp/src/groupby/sort/group_correlation.cu +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -53,8 +53,7 @@ struct is_double_convertible_impl { }; /** - * @brief Type casts each element of the column to `CastType` - * + * @brief Typecasts each element of the column to `CastType` */ template struct type_casted_accessor { From 075ec73edfb842cd483752f322773528c26b555c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 30 Sep 2021 22:49:16 +0530 Subject: [PATCH 46/79] add covariance, refactor correlation to use covariance --- cpp/include/cudf/aggregation.hpp | 11 ++ .../cudf/detail/aggregation/aggregation.hpp | 32 +++- cpp/src/aggregation/aggregation.cpp | 20 +++ cpp/src/groupby/sort/aggregate.cpp | 150 ++++++++++++------ cpp/src/groupby/sort/group_correlation.cu | 83 ++++++---- cpp/src/groupby/sort/group_reductions.hpp | 31 ++-- 6 files changed, 234 insertions(+), 93 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 7a62a64b6dc..9e556dbe704 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -88,6 +88,7 @@ class aggregation { MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries MERGE_M2, ///< merge partial values of M2 aggregation, + COVARIANCE, ///< covariance between two 
sets of elements CORRELATION, ///< correlation between two sets of elements TDIGEST, ///< create a tdigest from a set of input values MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together @@ -497,6 +498,15 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a COVARIANCE aggregation + * + * Compute covariance between two columns. + * The input columns are child columns of a non-nullable struct columns. + */ +template +std::unique_ptr make_covariance_aggregation(); + /** * @brief Factory to create a CORRELATION aggregation * @@ -507,6 +517,7 @@ std::unique_ptr make_merge_m2_aggregation(); */ template std::unique_ptr make_correlation_aggregation(correlation_type type); + /** * @brief Factory to create a TDIGEST aggregation * diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index ff192c438c7..e12ed3f521e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -91,6 +91,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class covariance_aggregation const& agg); virtual std::vector> visit(data_type col_type, class correlation_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -131,6 +133,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class covariance_aggregation const& agg); virtual void visit(class correlation_aggregation const& agg); 
virtual void visit(class tdigest_aggregation const& agg); virtual void visit(class merge_tdigest_aggregation const& agg); @@ -893,6 +896,25 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying COVARIANCE aggregation + */ +class covariance_aggregation final : public groupby_aggregation { + public: + explicit covariance_aggregation() : aggregation{COVARIANCE} {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying CORRELATION aggregation */ @@ -1209,7 +1231,13 @@ struct target_type_impl { using type = struct_view; }; -// Always use struct for CORRELATION +// Always use double for COVARIANCE +template +struct target_type_impl { + using type = double; +}; + +// Always use double for CORRELATION template struct target_type_impl { using type = double; @@ -1337,6 +1365,8 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::COVARIANCE: + return f.template operator()(std::forward(args)...); case aggregation::CORRELATION: return f.template operator()(std::forward(args)...); case aggregation::TDIGEST: diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 26ab5936a74..c2b7449fc96 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -202,6 +202,11 @@ std::vector> simple_aggregations_collector::visit( return 
visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, covariance_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} std::vector> simple_aggregations_collector::visit( data_type col_type, correlation_aggregation const& agg) { @@ -363,10 +368,16 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(covariance_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(correlation_aggregation const& agg) { visit(static_cast(agg)); } + void aggregation_finalizer::visit(tdigest_aggregation const& agg) { visit(static_cast(agg)); @@ -699,6 +710,15 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +/// Factory to create a COVARIANCE aggregation +template +std::unique_ptr make_covariance_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_covariance_aggregation(); +template std::unique_ptr make_covariance_aggregation(); + /// Factory to create a CORRELATION aggregation template std::unique_ptr make_correlation_aggregation(correlation_type type) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 45227368097..7cddfef5712 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -529,6 +529,77 @@ void aggregate_result_functor::operator()(aggregation con get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; +/** + * @brief Creates column views with only valid elements in both input column views + * + * @param column_0 The first column + * @param column_1 The second column + * @return tuple with new null mask (if null masks if input differ) and new column views + */ +auto column_view_with_common_nulls(column_view const& column_0, column_view 
const& column_1) +{ + rmm::device_buffer new_nullmask = cudf::bitmask_and(table_view{{column_0, column_1}}); + auto null_count = cudf::count_unset_bits( + static_cast(new_nullmask.data()), 0, column_0.size()); + if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } + auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { + return column_view(col.type(), + col.size(), + col.head(), + static_cast(nullmask), + null_count, + col.offset(), + std::vector(col.child_begin(), col.child_end())); + }; + auto new_column_0 = null_count == column_0.null_count() + ? column_0 + : column_view_with_new_nullmask(column_0, new_nullmask.data(), null_count); + auto new_column_1 = null_count == column_1.null_count() + ? column_1 + : column_view_with_new_nullmask(column_1, new_nullmask.data(), null_count); + return std::make_tuple(std::move(new_nullmask), new_column_0, new_column_1); +} + +/** + * @brief Perform covariance betweeen two child columns of non-nullable struct column. + * + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) { return; } + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `groupby covariance` must be a structs column."); + CUDF_EXPECTS(values.num_children() == 2, + "Input to `groupby covariance` must be a structs column having 2 children columns."); + + // Covariance only for valid values in both columns. + // in non-identical null mask cases, this prevents caching of the results - STD, MEAN, COUNT. 
+ auto [_, values_child0, values_child1] = + column_view_with_common_nulls(values.child(0), values.child(1)); + + auto mean_agg = make_mean_aggregation(); + aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*mean_agg); + aggregate_result_functor(values_child1, helper, cache, stream, mr).operator()(*mean_agg); + + auto const mean0 = cache.get_result(values_child0, *mean_agg); + auto const mean1 = cache.get_result(values_child1, *mean_agg); + auto count_agg = make_count_aggregation(); + auto const count = cache.get_result(values_child0, *count_agg); + + cache.add_result(values, + agg, + detail::group_covariance(get_grouped_values().child(0), + get_grouped_values().child(1), + helper.group_labels(stream), + helper.num_groups(stream), + count, + mean0, + mean1, + stream, + mr)); +}; + /** * @brief Perform correlation betweeen two child columns of non-nullable struct column. * @@ -538,44 +609,21 @@ void aggregate_result_functor::operator()(aggregation { if (cache.has_result(values, agg)) { return; } CUDF_EXPECTS(values.type().id() == type_id::STRUCT, - "Input to `group_corr` must be a structs column."); - CUDF_EXPECTS(values.num_children() == 2, - "Input to `group_corr` must be a structs column having 2 children columns."); + "Input to `groupby correlation` must be a structs column."); + CUDF_EXPECTS( + values.num_children() == 2, + "Input to `groupby correlation` must be a structs column having 2 children columns."); CUDF_EXPECTS(values.nullable() == false, - "Input to `group_corr` must be a non-nullable structs column."); + "Input to `groupby correlation` must be a non-nullable structs column."); auto const& corr_agg = dynamic_cast(agg); CUDF_EXPECTS(corr_agg._type == correlation_type::PEARSON, "Only Pearson correlation is supported."); // Correlation only for valid values in both columns. 
- auto [_, values_child0, values_child1] = [this]() { - rmm::device_buffer new_nullmask = - cudf::bitmask_and(table_view{{values.child(0), values.child(1)}}); - auto null_count = cudf::count_unset_bits( - static_cast(new_nullmask.data()), 0, values.size()); - if (null_count == 0) { - return std::make_tuple(std::move(new_nullmask), values.child(0), values.child(1)); - } - auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { - return column_view(col.type(), - col.size(), - col.head(), - static_cast(nullmask), - null_count, - col.offset(), - std::vector(col.child_begin(), col.child_end())); - }; - auto values_child0 = - null_count == values.child(0).null_count() - ? values.child(0) - : column_view_with_new_nullmask(values.child(0), new_nullmask.data(), null_count); - auto values_child1 = - null_count == values.child(1).null_count() - ? values.child(1) - : column_view_with_new_nullmask(values.child(1), new_nullmask.data(), null_count); - return std::make_tuple(std::move(new_nullmask), values_child0, values_child1); - }(); + // in non-identical null mask cases, this prevents caching of the results - STD, MEAN, COUNT + auto [_, values_child0, values_child1] = + column_view_with_common_nulls(values.child(0), values.child(1)); auto std_agg = make_std_aggregation(); aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*std_agg); @@ -583,26 +631,30 @@ void aggregate_result_functor::operator()(aggregation auto const stddev0 = cache.get_result(values_child0, *std_agg); auto const stddev1 = cache.get_result(values_child1, *std_agg); - auto mean_agg = make_mean_aggregation(); - auto const mean0 = cache.get_result(values_child0, *mean_agg); - auto const mean1 = cache.get_result(values_child1, *mean_agg); - auto count_agg = make_count_aggregation(); - auto const count = cache.get_result(values_child0, *count_agg); + auto mean_agg = make_mean_aggregation(); + auto const mean0 = cache.get_result(values_child0, 
*mean_agg); + auto const mean1 = cache.get_result(values_child1, *mean_agg); + auto count_agg = make_count_aggregation(); + auto const count = cache.get_result(values_child0, *count_agg); + + // Compute covariance here to avoid repeated computation of mean & count + auto cov_agg = make_covariance_aggregation(); cache.add_result(values, - agg, - detail::group_correlation(get_grouped_values().child(0), - get_grouped_values().child(1), - helper.group_labels(stream), - helper.num_groups(stream), - count, - mean0, - mean1, - stddev0, - stddev1, - stream, - mr)); -}; + *cov_agg, + detail::group_covariance(get_grouped_values().child(0), + get_grouped_values().child(1), + helper.group_labels(stream), + helper.num_groups(stream), + count, + mean0, + mean1, + stream, + mr)); + auto const covariance = cache.get_result(values, *cov_agg); + cache.add_result( + values, agg, detail::group_correlation(covariance, stddev0, stddev1, stream, mr)); +} /** * @brief Generate a tdigest column from a grouped set of numeric input values. diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu index 70a2cbd9bb8..0c11f62ea3b 100644 --- a/cpp/src/groupby/sort/group_correlation.cu +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -68,10 +68,9 @@ struct type_casted_accessor { }; template -struct corr_transform { +struct covariance_transform { column_device_view const d_values_0, d_values_1; ResultType const *d_means_0, *d_means_1; - ResultType const *d_stddev_0, *d_stddev_1; size_type const* d_group_sizes; size_type const* d_group_labels; size_type ddof{1}; // TODO update based on bias. 
@@ -98,26 +97,22 @@ struct corr_transform { // prevent divide by zero error if (group_size == 0 or group_size - ddof <= 0) return 0.0; - ResultType xmean = d_means_0[group_idx]; - ResultType ymean = d_means_1[group_idx]; - ResultType xstddev = d_stddev_0[group_idx]; - ResultType ystddev = d_stddev_1[group_idx]; - return (x - xmean) * (y - ymean) / (group_size - ddof) / xstddev / ystddev; + ResultType xmean = d_means_0[group_idx]; + ResultType ymean = d_means_1[group_idx]; + return (x - xmean) * (y - ymean) / (group_size - ddof); } }; } // namespace -std::unique_ptr group_correlation(column_view const& values_0, - column_view const& values_1, - cudf::device_span group_labels, - size_type num_groups, - column_view const& count, - column_view const& mean_0, - column_view const& mean_1, - column_view const& stddev_0, - column_view const& stddev_1, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr group_covariance(column_view const& values_0, + column_view const& values_1, + cudf::device_span group_labels, + size_type num_groups, + column_view const& count, + column_view const& mean_0, + column_view const& mean_1, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using result_type = id_to_type; static_assert( @@ -136,30 +131,22 @@ std::unique_ptr group_correlation(column_view const& values_0, type_dispatcher(get_base_type(values_1), is_double_convertible_impl{}); CUDF_EXPECTS(is_convertible, - "Input to `group_corr` must be columns of type convertible to float64."); + "Input to `group_correlation` must be columns of type convertible to float64."); - auto mean0_ptr = mean_0.begin(); - auto mean1_ptr = mean_1.begin(); - auto stddev0_ptr = stddev_0.begin(); - auto stddev1_ptr = stddev_1.begin(); + auto mean0_ptr = mean_0.begin(); + auto mean1_ptr = mean_1.begin(); auto d_values_0 = column_device_view::create(values_0, stream); auto d_values_1 = column_device_view::create(values_1, stream); - corr_transform 
corr_transform_op{*d_values_0, - *d_values_1, - mean0_ptr, - mean1_ptr, - stddev0_ptr, - stddev1_ptr, - count.data(), - group_labels.begin()}; + covariance_transform covariance_transform_op{ + *d_values_0, *d_values_1, mean0_ptr, mean1_ptr, count.data(), group_labels.begin()}; auto result = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); auto d_result = result->mutable_view().begin(); auto corr_iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), corr_transform_op); + thrust::make_transform_iterator(thrust::make_counting_iterator(0), covariance_transform_op); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), @@ -168,7 +155,7 @@ std::unique_ptr group_correlation(column_view const& values_0, thrust::make_discard_iterator(), d_result); - auto is_null = [ddof = corr_transform_op.ddof] __device__(size_type group_size) { + auto is_null = [ddof = covariance_transform_op.ddof] __device__(size_type group_size) { return not(group_size == 0 or group_size - ddof <= 0); }; auto [new_nullmask, null_count] = @@ -180,6 +167,36 @@ std::unique_ptr group_correlation(column_view const& values_0, return result; } +std::unique_ptr group_correlation(column_view const& covariance, + column_view const& stddev_0, + column_view const& stddev_1, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using result_type = id_to_type; + CUDF_EXPECTS(covariance.type().id() == type_id::FLOAT64, + "Covariance result as FLOAT64 is supported"); + auto stddev0_ptr = stddev_0.begin(); + auto stddev1_ptr = stddev_1.begin(); + auto stddev_iter = thrust::make_zip_iterator(thrust::make_tuple(stddev0_ptr, stddev1_ptr)); + auto result = make_numeric_column(covariance.type(), + covariance.size(), + cudf::detail::copy_bitmask(covariance, stream, mr), + covariance.null_count(), + stream, + mr); + auto d_result = result->mutable_view().begin(); + thrust::transform(rmm::exec_policy(stream), + 
covariance.begin(), + covariance.end(), + stddev_iter, + d_result, + [] __device__(auto const covariance, auto const stddev) { + return covariance / thrust::get<0>(stddev) / thrust::get<1>(stddev); + }); + return result; +} + } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index a56a3ae38cd..c40ef56a839 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -442,27 +442,38 @@ std::unique_ptr group_merge_m2(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to find correlation of child columns of a non-nullable struct column. + * @brief Internal API to find covariance of child columns of a non-nullable struct column. * - * @param values_0 The first grouped values column to correlate - * @param values_1 The second grouped values column to correlate + * @param values_0 The first grouped values column to compute covariance + * @param values_1 The second grouped values column to compute covariance * @param group_labels ID of group that the corresponding value belongs to * @param num_groups Number of groups. * @param count The count of valid rows of the grouped values of both columns * @param mean_0 The mean of the first grouped values column * @param mean_1 The mean of the second grouped values column + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr group_covariance(column_view const& values_0, + column_view const& values_1, + cudf::device_span group_labels, + size_type num_groups, + column_view const& count, + column_view const& mean_0, + column_view const& mean_1, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to find correlation from covariance and standard deviation. + * + * @param covariance The covariance of two grouped values columns * @param stddev_0 The standard deviation of the first grouped values column * @param stddev_1 The standard deviation of the second grouped values column * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory */ -std::unique_ptr group_correlation(column_view const& values_0, - column_view const& values_1, - cudf::device_span group_labels, - size_type num_groups, - column_view const& count, - column_view const& mean_0, - column_view const& mean_1, +std::unique_ptr group_correlation(column_view const& covariance, column_view const& stddev_0, column_view const& stddev_1, rmm::cuda_stream_view stream, From 60532e86525ba1b7a68bd28d791a31f02312df3d Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 30 Sep 2021 12:33:40 -0700 Subject: [PATCH 47/79] create new PR --- python/cudf/cudf/_lib/aggregation.pyx | 21 +++++++++++++++ python/cudf/cudf/_lib/cpp/aggregation.pxd | 10 ++++++++ python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/core/groupby/groupby.py | 31 +++++++++++++++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 4f703724cef..76eb3ba3bb2 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -57,6 +57,7 @@ class AggregationKind(Enum): UNIQUE = 
libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA + CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION cdef class Aggregation: @@ -321,6 +322,15 @@ cdef class Aggregation: )) return agg + @classmethod + def corr(cls): + cdef Aggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_correlation_aggregation[aggregation]( + libcudf_aggregation.correlation_type.PEARSON + )) + return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -692,6 +702,17 @@ cdef class GroupbyAggregation: ) return agg + @classmethod + def corr(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_correlation_aggregation[groupby_aggregation]( + libcudf_aggregation.correlation_type.PEARSON + )) + + return agg + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 13bfa49057c..04deeb877d1 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -38,6 +38,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' + CORRELATION 'cudf::aggregation::CORRELATION' + Kind kind cdef cppclass rolling_aggregation: @@ -53,6 +55,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' + ctypedef enum correlation_type: + PEARSON 'cudf::correlation_type::PEARSON' + KENDALL 'cudf::correlation_type::KENDALL' + SPEARMAN 'cudf::correlation_type::SPEARMAN' + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -106,3 +113,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: 
udf_type type, string user_defined_aggregator, data_type output_type) except + + + cdef unique_ptr[T] make_correlation_aggregation[T]( + correlation_type type) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 19ef6555a6e..a41b7c79520 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = set() +_STRUCT_AGGS = {"CORRELATION"} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e00d964f989..5f04be89be8 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
import collections +import itertools import pickle import warnings @@ -781,6 +782,36 @@ def median(self): """Get the column-wise median of the values in each group.""" return self.agg("median") + def corr(self): + # breakpoint() + _cols = self.grouping.values.columns.tolist() + new_df = cudf.DataFrame({self.grouping.keys.names: self.grouping.keys}) + new_df._data.multiindex = False + for i in tuple(itertools.combinations_with_replacement(_cols, 2)): + new_df[i] = cudf.DataFrame( + {"x": self.obj[i[0]], "y": self.obj[i[1]]} + ).to_struct() + new_gb = new_df.groupby(self.grouping) + gb_corr = new_gb.agg("corr") + + cols_list = [] + for i, x in enumerate(_cols): + for j, y in enumerate(_cols): + if i > j: + cols_list.append((_cols[j], _cols[i])) + else: + cols_list.append((_cols[i], _cols[j])) + cols_split = [ + cols_list[i : i + 3] for i in range(0, len(cols_list), 3) + ] + + res = cudf.DataFrame() + for i, x in zip(cols_split, _cols): + ic = gb_corr.loc[:, i].interleave_columns() + res[x] = ic + + return res + def var(self, ddof=1): """Compute the column-wise variance of the values in each group. 
From 077a1872a785374904d92d38497cce284908c030 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 4 Oct 2021 21:17:09 +0530 Subject: [PATCH 48/79] add more null cases for correlation tests --- cpp/tests/groupby/correlation_tests.cpp | 60 +++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp index db238ae5998..90d230ef1eb 100644 --- a/cpp/tests/groupby/correlation_tests.cpp +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -119,16 +119,16 @@ TYPED_TEST(groupby_correlation_test, null_keys_and_values) using V = TypeParam; using R = cudf::detail::target_type_t; + // clang-format off fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); - fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3, -1, 1, 4, 4}, + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3, -1, 0, 2, 2}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2}); + // clang-format on auto vals = structs{{val0, val1}}; - // { 1, 1, 2, 2, 2, 3, 3, 4} fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); - // { 3, 6, 1, 4, 9, 2, 8, 3} fixed_width_column_wrapper expect_vals( {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0}); @@ -137,6 +137,58 @@ TYPED_TEST(groupby_correlation_test, null_keys_and_values) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); } +TYPED_TEST(groupby_correlation_test, null_values_same) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + fixed_width_column_wrapper val1({1, 
1, 1, 2, 0, 3, 3,-1, 0, 2, 2}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals( + {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0}); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +// keys=[1, 1, 1, 2, 2, 2, 2, 3, N, 3, 4] +// val0=[N, 2, 3, 1, N, 3, 4, 1,-1, 1, 4] +// val1=[N, 2, 3, 2,-1, 6,-6/1, 1,-1, 0, N] +// corr=[ 1.0, -0.5/0, NAN, NAN] +TYPED_TEST(groupby_correlation_test, null_values_different) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 2, 1, 2,-1, 6, 3,-1, 0, 1, 2}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({1.0, 0., std::numeric_limits::quiet_NaN(), 0.}, + {1, 1, 1, 0}); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + struct groupby_dictionary_correlation_test : public cudf::test::BaseFixture { }; From e3f47c13c13bb2c41769163001e00253e73fe1be Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 4 Oct 2021 23:23:08 +0530 Subject: [PATCH 49/79] add covariance tests --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/covariance_tests.cpp | 199 +++++++++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 
cpp/tests/groupby/covariance_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ea7338b4da1..d32b18cb929 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,6 +62,7 @@ ConfigureTest(GROUPBY_TEST groupby/correlation_tests.cpp groupby/count_scan_tests.cpp groupby/count_tests.cpp + groupby/covariance_tests.cpp groupby/groups_tests.cpp groupby/keys_tests.cpp groupby/lists_tests.cpp diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp new file mode 100644 index 00000000000..039fce16222 --- /dev/null +++ b/cpp/tests/groupby/covariance_tests.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cudf::test::iterators; +namespace cudf { +namespace test { + +using structs = structs_column_wrapper; + +template +struct groupby_covariance_test : public cudf::test::BaseFixture { +}; + +using supported_types = RemoveIf>, cudf::test::NumericTypes>; + +TYPED_TEST_CASE(groupby_covariance_test, supported_types); +using K = int32_t; + +TYPED_TEST(groupby_covariance_test, basic) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{{1.0, 1.0, 0.0}}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, empty_cols) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper member_0{}, member_1{}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper expect_vals{}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, zero_valid_keys) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys({1, 2, 3}, all_nulls()); + fixed_width_column_wrapper member_0{3, 4, 5}, member_1{6, 7, 8}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper 
expect_vals{}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, zero_valid_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{1, 1, 1}; + fixed_width_column_wrapper member_0({3, 4, 5}, all_nulls()); + fixed_width_column_wrapper member_1({3, 4, 5}, all_nulls()); + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1}; + fixed_width_column_wrapper expect_vals({0}, all_nulls()); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, null_keys_and_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({0.5, 1.0, 0.0, -0.}, {1, 1, 1, 0}); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, null_values_same) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2}, 
+ {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({0.5, 1.0, 0.0, -0.}, {1, 1, 1, 0}); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, null_values_different) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 2, 1, 2,-1, 3, 3,-1, 0, 4, 2}, + {0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals( + {std::numeric_limits::quiet_NaN(), 1.5, 0.0, -0.}, {0, 1, 1, 0}); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +struct groupby_dictionary_covariance_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_dictionary_covariance_test, basic) +{ + using V = int16_t; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = dictionary_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = dictionary_column_wrapper{{1, 1, 1, 2, 3, -3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{{1.0, -0.5, 0.0}}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), 
force_use_sort_impl::YES); +} + +} // namespace test +} // namespace cudf From 9c5b81d7b2524ce65b324ca379ab6ed4a51e56f7 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 6 Oct 2021 16:02:51 -0700 Subject: [PATCH 50/79] fixed merge conflict in result_cache.hpp --- cpp/include/cudf/detail/aggregation/result_cache.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index e758feb1fde..41f5c19f06a 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -34,11 +34,7 @@ struct pair_column_aggregation_equal_to { }; struct pair_column_aggregation_hash { -<<<<<<< HEAD - size_t operator()(std::pair const& key) const noexcept -======= size_t operator()(std::pair const& key) const ->>>>>>> 3f09f967fe07246138ff6cfbed84675960a75f94 { return hash_combine(shallow_hash(key.first), key.second.do_hash()); } From 8426f563200ae365bbd79dbf31e56e9d107e84f1 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 8 Oct 2021 22:23:00 +0530 Subject: [PATCH 51/79] Apply suggestions from code review Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/groupby/sort/aggregate.cpp | 2 +- cpp/src/groupby/sort/group_correlation.cu | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 7cddfef5712..e471fccda07 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -534,7 +534,7 @@ void aggregate_result_functor::operator()(aggregation con * * @param column_0 The first column * @param column_1 The second column - * @return tuple with new null mask (if null masks if input differ) and new column views + * @return tuple with new null mask (if null masks of input differ) and new column views */ auto 
column_view_with_common_nulls(column_view const& column_0, column_view const& column_1) { diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu index 0c11f62ea3b..daf99563270 100644 --- a/cpp/src/groupby/sort/group_correlation.cu +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -88,17 +88,17 @@ struct covariance_transform { if (d_values_0.is_null(i) or d_values_1.is_null(i)) return 0.0; // This has to be device dispatch because x and y type may differ - auto x = value(d_values_0, i); - auto y = value(d_values_1, i); + auto const x = value(d_values_0, i); + auto const y = value(d_values_1, i); - size_type group_idx = d_group_labels[i]; - size_type group_size = d_group_sizes[group_idx]; + size_type const group_idx = d_group_labels[i]; + size_type const group_size = d_group_sizes[group_idx]; // prevent divide by zero error if (group_size == 0 or group_size - ddof <= 0) return 0.0; - ResultType xmean = d_means_0[group_idx]; - ResultType ymean = d_means_1[group_idx]; + ResultType const xmean = d_means_0[group_idx]; + ResultType const ymean = d_means_1[group_idx]; return (x - xmean) * (y - ymean) / (group_size - ddof); } }; From f7470d2e8fb4f1054cfe3cf9596f949909aaf1a1 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 13 Oct 2021 10:11:58 -0700 Subject: [PATCH 52/79] fixing multiindex to match pandas behavior --- python/cudf/cudf/core/groupby/groupby.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 34335a52cdf..8597127fbe1 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -809,6 +809,13 @@ def corr(self): ic = gb_corr.loc[:, i].interleave_columns() res[x] = ic + _index = cudf.DataFrame( + { + self.grouping.keys.names[0]: self.grouping.keys, + None: _cols * (len(_cols)), + } + ) + res.index = _index return res def var(self, ddof=1): From 
407b616ab523c50890743ff71a85449be6bd12ef Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 19 Oct 2021 13:56:56 -0700 Subject: [PATCH 53/79] adding tests --- python/cudf/cudf/tests/test_dataframe.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c1eade0fcdc..993e20c7ac7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8932,3 +8932,29 @@ def test_frame_series_where_other(data): expected = gdf.where(gdf["b"] == 1, 0) actual = pdf.where(pdf["b"] == 1, 0) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + { + "id": [0] * 4 + [1] * 3, + "a": [10, 3, 4, 2, -3, 9, 10], + "b": [10, 23, -4, 2, -3, 9, 19], + "c": [10, -23, -4, 21, -3, 19, 19], + }, + ], +) +def test_dataframe_pearson_corr(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = gdf.groupby("id").corr() + actual = pdf.groupby("id").corr() + assert_eq(expected, actual) From c58cff3e81e5e7e0a1cb50e3fcfb976cb0067b35 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 20 Oct 2021 22:13:06 -0700 Subject: [PATCH 54/79] added method parameter to corr() --- python/cudf/cudf/core/groupby/groupby.py | 70 +++++++++++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 4 +- 2 files changed, 70 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8597127fbe1..9520a982899 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -781,8 +781,74 @@ def median(self): """Get the column-wise median of the values in each group.""" return self.agg("median") - def corr(self): - # breakpoint() + def corr(self, 
method="pearson"): + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method: Method of correlation + {‘pearson’, ‘kendall’, ‘spearman’} or callable + + pearson : standard correlation coefficient + + kendall : Kendall Tau correlation coefficient + + spearman : Spearman rank correlation + + callable: callable with input two 1d ndarrays and returning + float. Note that the returned matrix from corr will have 1 + along the diagonals and will be symmetric regardless of the + callable’s behavior. + + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + Returns + ---------- + DataFrame + Correlation matrix. + + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + + """ + + if method in ["kendall", "spearman"]: + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + _cols = self.grouping.values.columns.tolist() new_df = cudf.DataFrame({self.grouping.keys.names: self.grouping.keys}) new_df._data.multiindex = False diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 993e20c7ac7..67b101901d1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ 
b/python/cudf/cudf/tests/test_dataframe.py @@ -8955,6 +8955,6 @@ def test_dataframe_pearson_corr(data): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - expected = gdf.groupby("id").corr() - actual = pdf.groupby("id").corr() + expected = gdf.groupby("id").corr("pearson") + actual = pdf.groupby("id").corr("pearson") assert_eq(expected, actual) From 70be97bdc00df6c27fde754d0580e177ada78062 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 27 Oct 2021 22:56:42 -0700 Subject: [PATCH 55/79] create multiindex using groupby correlated index info --- python/cudf/cudf/core/groupby/groupby.py | 42 +++++++++++------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f49b6f0fc49..6e01b5f0eff 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -781,27 +781,18 @@ def median(self): """Get the column-wise median of the values in each group.""" return self.agg("median") - def corr(self, method="pearson"): + def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. Parameters ---------- - method: Method of correlation - {‘pearson’, ‘kendall’, ‘spearman’} or callable + method: Method of correlation, default 'Pearson' + Pearson: standard correlation coefficient. + Kendall, Spearman correlation and callable method + not yet supported. - pearson : standard correlation coefficient - - kendall : Kendall Tau correlation coefficient - - spearman : Spearman rank correlation - - callable: callable with input two 1d ndarrays and returning - float. Note that the returned matrix from corr will have 1 - along the diagonals and will be symmetric regardless of the - callable’s behavior. - - min_periods: int, optional + min_periods: int, default 1 Minimum number of observations required per pair of columns to have a valid result. 
@@ -844,11 +835,12 @@ def corr(self, method="pearson"): """ - if method in ["kendall", "spearman"]: + if not method.lower() in ["pearson"]: raise NotImplementedError( "Only pearson correlation is currently supported" ) - + # create all combinations of the struct columns-pairs to be correlated + # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) _cols = self.grouping.values.columns.tolist() new_df = cudf.DataFrame({self.grouping.keys.names: self.grouping.keys}) new_df._data.multiindex = False @@ -858,7 +850,7 @@ def corr(self, method="pearson"): ).to_struct() new_gb = new_df.groupby(self.grouping) gb_corr = new_gb.agg("corr") - + # ensure that column-pair labels are arranged in ascending order cols_list = [] for i, x in enumerate(_cols): for j, y in enumerate(_cols): @@ -867,18 +859,22 @@ def corr(self, method="pearson"): else: cols_list.append((_cols[i], _cols[j])) cols_split = [ - cols_list[i : i + 3] for i in range(0, len(cols_list), 3) + cols_list[i : i + len(_cols)] + for i in range(0, len(cols_list), len(_cols)) ] - + # interleave: combine the correlation results of each column-pair + # into a single column res = cudf.DataFrame() for i, x in zip(cols_split, _cols): ic = gb_corr.loc[:, i].interleave_columns() res[x] = ic - + # create a multiindex for the groupby correlated dataframe, + # to match pandas behavior + _idx = gb_corr._index.to_pandas().values.tolist() _index = cudf.DataFrame( { - self.grouping.keys.names[0]: self.grouping.keys, - None: _cols * (len(_cols)), + gb_corr.index.name: sorted(_idx * len(_cols)), + None: _cols * (len(gb_corr.index)), } ) res.index = _index From f906b7930a391215163ca6cda76a6ec0e0323379 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 27 Oct 2021 22:58:34 -0700 Subject: [PATCH 56/79] added tests - one, two, three columns cases --- python/cudf/cudf/tests/test_dataframe.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py 
b/python/cudf/cudf/tests/test_dataframe.py index 67b101901d1..5ba0f94e810 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8943,6 +8943,12 @@ def test_frame_series_where_other(data): "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], }, + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [5, 4, 6, 8, 7, 2], + "val2": [4, 5, 1, 2, 9, 5], + }, + {"id": ["a", "a", "b", "b", "c", "c"], "val": [10, 3, 4, 2, -3, 9]}, { "id": [0] * 4 + [1] * 3, "a": [10, 3, 4, 2, -3, 9, 10], @@ -8958,3 +8964,10 @@ def test_dataframe_pearson_corr(data): expected = gdf.groupby("id").corr("pearson") actual = pdf.groupby("id").corr("pearson") assert_eq(expected, actual) + + +def test_pearson_corr_empty_dataframe(): + with pytest.raises( + ValueError, match="Grouper and object must have same length" + ): + cudf.DataFrame().corr("pearson") From d800c8951a2884647028e5d317ef89ef83f66e5b Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 3 Nov 2021 15:16:42 -0700 Subject: [PATCH 57/79] added min_periods param. to cython layer --- python/cudf/cudf/_lib/aggregation.pyx | 42 +++++++++++++++++++---- python/cudf/cudf/_lib/cpp/aggregation.pxd | 5 +-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 76eb3ba3bb2..f2e17965124 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,6 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
-from enum import Enum +from enum import Enum, IntEnum import numba import numpy as np @@ -30,6 +30,7 @@ from cudf._lib.types import Interpolation cimport cudf._lib.cpp.aggregation as libcudf_aggregation cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type import cudf @@ -60,6 +61,21 @@ class AggregationKind(Enum): CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION +class CorrelationType(IntEnum): + PEARSON = ( + + libcudf_aggregation.correlation_type.PEARSON + ) + KENDALL = ( + + libcudf_aggregation.correlation_type.KENDALL + ) + SPEARMAN = ( + + libcudf_aggregation.correlation_type.SPEARMAN + ) + + cdef class Aggregation: """A Cython wrapper for aggregations. @@ -323,11 +339,18 @@ cdef class Aggregation: return agg @classmethod - def corr(cls): + def corr(cls, method, libcudf_types.size_type min_periods): cdef Aggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) agg.c_obj = move( libcudf_aggregation.make_correlation_aggregation[aggregation]( - libcudf_aggregation.correlation_type.PEARSON + c_method, min_periods )) return agg @@ -703,16 +726,23 @@ cdef class GroupbyAggregation: return agg @classmethod - def corr(cls): + def corr(cls, method, libcudf_types.size_type min_periods): cdef GroupbyAggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) agg.c_obj = move( libcudf_aggregation. make_correlation_aggregation[groupby_aggregation]( - libcudf_aggregation.correlation_type.PEARSON + c_method, min_periods )) - return agg + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. 
diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 04deeb877d1..31839ee5fcc 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2020, NVIDIA CORPORATION. - +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -11,6 +11,7 @@ from cudf._lib.cpp.types cimport ( size_type, ) +ctypedef int32_t underlying_type_t_correlation_type cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: @@ -115,4 +116,4 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: data_type output_type) except + cdef unique_ptr[T] make_correlation_aggregation[T]( - correlation_type type) except + + correlation_type type, size_type min_periods) except + From db8b47f94e002a4a08ab8b4f37ee7db21b16ceaa Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 3 Nov 2021 15:23:22 -0700 Subject: [PATCH 58/79] create new_df from grouping keys data --- python/cudf/cudf/core/groupby/groupby.py | 10 +++++++--- python/cudf/cudf/tests/test_dataframe.py | 7 ------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6e01b5f0eff..98752eb3928 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -71,6 +71,8 @@ def __init__( """ self.obj = obj self._as_index = as_index + self._by = by + self._level = level self._sort = sort self._dropna = dropna @@ -842,14 +844,16 @@ def corr(self, method="pearson", min_periods=1): # create all combinations of the struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) _cols = self.grouping.values.columns.tolist() - new_df = cudf.DataFrame({self.grouping.keys.names: self.grouping.keys}) + # breakpoint() + new_df = 
cudf.DataFrame._from_data(self.grouping.keys._data) new_df._data.multiindex = False for i in tuple(itertools.combinations_with_replacement(_cols, 2)): new_df[i] = cudf.DataFrame( {"x": self.obj[i[0]], "y": self.obj[i[1]]} ).to_struct() - new_gb = new_df.groupby(self.grouping) - gb_corr = new_gb.agg("corr") + new_gb = new_df.groupby(by=self._by, level=self._level) + # breakpoint() + gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) # ensure that column-pair labels are arranged in ascending order cols_list = [] for i, x in enumerate(_cols): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5ba0f94e810..23416ed63ac 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8964,10 +8964,3 @@ def test_dataframe_pearson_corr(data): expected = gdf.groupby("id").corr("pearson") actual = pdf.groupby("id").corr("pearson") assert_eq(expected, actual) - - -def test_pearson_corr_empty_dataframe(): - with pytest.raises( - ValueError, match="Grouper and object must have same length" - ): - cudf.DataFrame().corr("pearson") From 36baa30d635e40b3c76c439e751366c71b56909e Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 9 Nov 2021 13:30:52 -0800 Subject: [PATCH 59/79] updated copyright years --- python/cudf/cudf/_lib/aggregation.pxd | 2 +- python/cudf/cudf/_lib/aggregation.pyx | 2 +- python/cudf/cudf/_lib/groupby.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd index 84bcaed1b36..85a729ad2a3 100644 --- a/python/cudf/cudf/_lib/aggregation.pxd +++ b/python/cudf/cudf/_lib/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index f2e17965124..68f7101b6ee 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from enum import Enum, IntEnum diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index b093d4cf364..314542c9549 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from collections import defaultdict From 54ef35bf8cab57a37ea2ca53ec73ff46761e4772 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 9 Nov 2021 13:33:39 -0800 Subject: [PATCH 60/79] added test for nulls and unsupoorted methods --- python/cudf/cudf/tests/test_dataframe.py | 35 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1323282e573..1ce84c2a028 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8943,17 +8943,20 @@ def test_frame_series_where_other(data): "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], }, - { - "id": ["a", "a", "b", "b", "c", "c"], - "val1": [5, 4, 6, 8, 7, 2], - "val2": [4, 5, 1, 2, 9, 5], - }, - {"id": ["a", "a", "b", "b", "c", "c"], "val": [10, 3, 4, 2, -3, 9]}, { "id": [0] * 4 + [1] * 3, "a": [10, 3, 4, 2, -3, 9, 10], "b": [10, 23, -4, 2, -3, 9, 19], - "c": [10, -23, -4, 21, -3, 19, 19], + }, + {"id": ["a", "a", "b", "b", "c", "c"], "val": [10, 3, 4, 2, -3, 9]}, + { + "id": ["a", "a", "b", "b", "c", "c"], + "val": [None, None, None, None, None, None], + }, + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, 4, 6, 8, None, 2], + "val2": [4, 5, 
None, 2, 9, None], }, ], ) @@ -8964,3 +8967,21 @@ def test_dataframe_pearson_corr(data): expected = gdf.groupby("id").corr("pearson") actual = pdf.groupby("id").corr("pearson") assert_eq(expected, actual) + + +@pytest.mark.parametrize("method", ["kendall", "spearman"]) +def test_dataframe_pearson_corr_unsupported_methods(method): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + } + ) + + with pytest.raises( + NotImplementedError, + match="Only pearson correlation is currently supported", + ): + gdf.groupby("id").corr(method) From 1e1431be0d4c5136069a237ac4b63ac9f5c4df50 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 9 Nov 2021 19:46:18 -0800 Subject: [PATCH 61/79] minor review-fixes --- python/cudf/cudf/core/groupby/groupby.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 98752eb3928..c9f69252001 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -789,12 +789,12 @@ def corr(self, method="pearson", min_periods=1): Parameters ---------- - method: Method of correlation, default 'Pearson' + method: Method of correlation Pearson: standard correlation coefficient. Kendall, Spearman correlation and callable method not yet supported. - min_periods: int, default 1 + min_periods: int, optional Minimum number of observations required per pair of columns to have a valid result. 
@@ -841,7 +841,8 @@ def corr(self, method="pearson", min_periods=1): raise NotImplementedError( "Only pearson correlation is currently supported" ) - # create all combinations of the struct columns-pairs to be correlated + # create expanded dataframe consisting all combinations of the + # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) _cols = self.grouping.values.columns.tolist() # breakpoint() From ab6cd9532569e5fdc65854e0d1c06e13996a93f0 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 9 Nov 2021 19:55:51 -0800 Subject: [PATCH 62/79] added tests for: invalid types, empty dataframe and multiindex. All failing --- python/cudf/cudf/tests/test_dataframe.py | 62 ++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1ce84c2a028..c8e4024a666 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8958,14 +8958,21 @@ def test_frame_series_where_other(data): "val1": [None, 4, 6, 8, None, 2], "val2": [4, 5, None, 2, 9, None], }, + {"id": ["a"], "val1": [2], "val2": [3]}, ], ) -def test_dataframe_pearson_corr(data): +@pytest.mark.parametrize( + "min_periods", [0, 1, 2, 3, 4], +) +def test_dataframe_pearson_corr(data, min_periods): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - expected = gdf.groupby("id").corr("pearson") - actual = pdf.groupby("id").corr("pearson") + expected = gdf.groupby("id").corr( + method="pearson", min_periods=min_periods + ) + actual = pdf.groupby("id").corr(method="pearson", min_periods=min_periods) + assert_eq(expected, actual) @@ -8985,3 +8992,52 @@ def test_dataframe_pearson_corr_unsupported_methods(method): match="Only pearson correlation is currently supported", ): gdf.groupby("id").corr(method) + + +def test_pearson_corr_empty_dataframe(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + 
+ expected = gdf.groupby("id").corr("pearson") + actual = pdf.groupby("id").corr("pearson") + + assert_eq( + expected, actual + ) # fails: DataFrame.index classes are not equivalent + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + ], +) +@pytest.mark.parametrize("groupby", ["id", "val1", "val2"]) +def test_pearson_corr_invalid_column_types(data, groupby): + try: + cudf.DataFrame(data).groupby(groupby).corr("pearson") + except RuntimeError as e: + if "Unsupported type-agg combination" in str(e): + raise TypeError( + "Correlation accepts only numerical column-pairs" + ) from e + + +def test_pearson_corr_multiindex_dataframe(): + gdf = cudf.DataFrame( + {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]} + ).set_index(["a", "b"]) + + expected = gdf.groupby(level="a").corr("pearson") + actual = gdf.to_pandas().groupby(level="a").corr("pearson") + + assert_eq(expected, actual) From 34d412e22916b11873bf88fe10d2d91f7dcbc11d Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 10 Nov 2021 01:28:12 -0800 Subject: [PATCH 63/79] added test for grouping by multiple columns, passes --- python/cudf/cudf/tests/test_dataframe.py | 77 +++++++++++++----------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c8e4024a666..fe800fefac4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8935,49 +8935,58 @@ def test_frame_series_where_other(data): @pytest.mark.parametrize( - "data", + "data, gkey", [ - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - 
"val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - }, - { - "id": [0] * 4 + [1] * 3, - "a": [10, 3, 4, 2, -3, 9, 10], - "b": [10, 23, -4, 2, -3, 9, 19], - }, - {"id": ["a", "a", "b", "b", "c", "c"], "val": [10, 3, 4, 2, -3, 9]}, - { - "id": ["a", "a", "b", "b", "c", "c"], - "val": [None, None, None, None, None, None], - }, - { - "id": ["a", "a", "b", "b", "c", "c"], - "val1": [None, 4, 6, 8, None, 2], - "val2": [4, 5, None, 2, 9, None], - }, - {"id": ["a"], "val1": [2], "val2": [3]}, + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id", "val1", "val2"], + ), + ( + { + "id": [0] * 4 + [1] * 3, + "a": [10, 3, 4, 2, -3, 9, 10], + "b": [10, 23, -4, 2, -3, 9, 19], + }, + ["id", "a"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val": [None, None, None, None, None, None], + }, + ["id"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, 4, 6, 8, None, 2], + "val2": [4, 5, None, 2, 9, None], + }, + ["id"], + ), + ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]), ], ) @pytest.mark.parametrize( - "min_periods", [0, 1, 2, 3, 4], + "min_per", [0, 1, 2, 3, 4], ) -def test_dataframe_pearson_corr(data, min_periods): +def test_dataframe_pearson_corr(data, gkey, min_per): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - expected = gdf.groupby("id").corr( - method="pearson", min_periods=min_periods - ) - actual = pdf.groupby("id").corr(method="pearson", min_periods=min_periods) + expected = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + actual = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) assert_eq(expected, actual) @pytest.mark.parametrize("method", ["kendall", "spearman"]) -def test_dataframe_pearson_corr_unsupported_methods(method): +def test_pearson_corr_unsupported_methods(method): gdf = cudf.DataFrame( { "id": ["a", "a", "a", "b", "b", "b", 
"c", "c", "c"], @@ -8994,7 +9003,7 @@ def test_dataframe_pearson_corr_unsupported_methods(method): gdf.groupby("id").corr(method) -def test_pearson_corr_empty_dataframe(): +def test_pearson_corr_empty_columns(): gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) pdf = gdf.to_pandas() @@ -9021,10 +9030,10 @@ def test_pearson_corr_empty_dataframe(): }, ], ) -@pytest.mark.parametrize("groupby", ["id", "val1", "val2"]) -def test_pearson_corr_invalid_column_types(data, groupby): +@pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) +def test_pearson_corr_invalid_column_types(data, gkey): try: - cudf.DataFrame(data).groupby(groupby).corr("pearson") + cudf.DataFrame(data).groupby(gkey).corr("pearson") except RuntimeError as e: if "Unsupported type-agg combination" in str(e): raise TypeError( From b6420494f63b7622525638572c6271745723b4fc Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 10 Nov 2021 01:34:06 -0800 Subject: [PATCH 64/79] fixes multiindex to match pd for multiple groupings-cases --- python/cudf/cudf/core/groupby/groupby.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index c9f69252001..f80ff3e3d94 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -841,11 +841,11 @@ def corr(self, method="pearson", min_periods=1): raise NotImplementedError( "Only pearson correlation is currently supported" ) + # create expanded dataframe consisting all combinations of the # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) _cols = self.grouping.values.columns.tolist() - # breakpoint() new_df = cudf.DataFrame._from_data(self.grouping.keys._data) new_df._data.multiindex = False for i in tuple(itertools.combinations_with_replacement(_cols, 2)): @@ -853,8 +853,8 @@ def corr(self, method="pearson", min_periods=1): {"x": self.obj[i[0]], "y": self.obj[i[1]]} 
).to_struct() new_gb = new_df.groupby(by=self._by, level=self._level) - # breakpoint() gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) + # ensure that column-pair labels are arranged in ascending order cols_list = [] for i, x in enumerate(_cols): @@ -867,22 +867,23 @@ def corr(self, method="pearson", min_periods=1): cols_list[i : i + len(_cols)] for i in range(0, len(cols_list), len(_cols)) ] - # interleave: combine the correlation results of each column-pair + + # interleave: combine the correlation results for each column-pair # into a single column res = cudf.DataFrame() for i, x in zip(cols_split, _cols): ic = gb_corr.loc[:, i].interleave_columns() res[x] = ic + # create a multiindex for the groupby correlated dataframe, # to match pandas behavior _idx = gb_corr._index.to_pandas().values.tolist() _index = cudf.DataFrame( - { - gb_corr.index.name: sorted(_idx * len(_cols)), - None: _cols * (len(gb_corr.index)), - } + sorted(_idx * len(_cols)), columns=gb_corr.index.names ) + _index[None] = _cols * (len(gb_corr.index)) res.index = _index + return res def var(self, ddof=1): From 0c2e17ec3ee59d8eee7aa1fd9fd5210aadad6814 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 10 Nov 2021 13:33:04 -0800 Subject: [PATCH 65/79] changes:call with ashwin-create MI for non empty results, capture runtime error, etc --- python/cudf/cudf/core/groupby/groupby.py | 46 +++++++++++++++++------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f80ff3e3d94..6bc04094d66 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -13,7 +13,8 @@ from cudf._typing import DataFrameOrSeries from cudf.api.types import is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import arange +from cudf.core.column.column import arange, as_column +from cudf.core.index import _index_from_data from cudf.utils.utils 
import GetAttrGetItemMixin, cached_property @@ -845,15 +846,32 @@ def corr(self, method="pearson", min_periods=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + # breakpoint() _cols = self.grouping.values.columns.tolist() - new_df = cudf.DataFrame._from_data(self.grouping.keys._data) - new_df._data.multiindex = False + + if self._by: + new_df = cudf.DataFrame._from_data(self.grouping.keys._data) + new_df._data.multiindex = False + else: + new_df = cudf.DataFrame._from_data( + {}, index=_index_from_data(self.grouping.keys._data) + ) + for i in tuple(itertools.combinations_with_replacement(_cols, 2)): - new_df[i] = cudf.DataFrame( - {"x": self.obj[i[0]], "y": self.obj[i[1]]} + new_df._data[i] = cudf.DataFrame._from_data( + {"x": self.obj._data[i[0]], "y": self.obj._data[i[1]]} ).to_struct() new_gb = new_df.groupby(by=self._by, level=self._level) - gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) + try: + + gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) + except RuntimeError as e: + if "Unsupported type-agg combination" in str(e): + raise TypeError( + "Correlation accepts only numerical column-pairs" + ) from e + else: + raise # ensure that column-pair labels are arranged in ascending order cols_list = [] @@ -877,12 +895,16 @@ def corr(self, method="pearson", min_periods=1): # create a multiindex for the groupby correlated dataframe, # to match pandas behavior - _idx = gb_corr._index.to_pandas().values.tolist() - _index = cudf.DataFrame( - sorted(_idx * len(_cols)), columns=gb_corr.index.names - ) - _index[None] = _cols * (len(gb_corr.index)) - res.index = _index + _idx = gb_corr.index.repeat(len(_cols)) + idx_sort_order = _idx._get_sorted_inds() + _idx = _idx._gather(idx_sort_order) + # breakpoint() + if len(gb_corr): + # TO-DO: Should the operation below be done on the CPU instead? 
+ _idx._data[None] = as_column( + cudf.Series(_cols).tile(len(gb_corr.index)) + ) + res.index = _index_from_data(_idx._data) return res From 124c576bc761b7de9bd16be1f272bfcce5bf06cb Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 10 Nov 2021 13:34:32 -0800 Subject: [PATCH 66/79] all tests passing now --- python/cudf/cudf/tests/test_dataframe.py | 27 +++++++++++------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fe800fefac4..08bb54d3423 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8975,12 +8975,12 @@ def test_frame_series_where_other(data): @pytest.mark.parametrize( "min_per", [0, 1, 2, 3, 4], ) -def test_dataframe_pearson_corr(data, gkey, min_per): +def test_pearson_corr_passing(data, gkey, min_per): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - expected = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) - actual = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + actual = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) assert_eq(expected, actual) @@ -9007,12 +9007,12 @@ def test_pearson_corr_empty_columns(): gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) pdf = gdf.to_pandas() - expected = gdf.groupby("id").corr("pearson") - actual = pdf.groupby("id").corr("pearson") + actual = gdf.groupby("id").corr("pearson") + expected = pdf.groupby("id").corr("pearson") assert_eq( - expected, actual - ) # fails: DataFrame.index classes are not equivalent + expected, actual, check_dtype=False, check_index_type=False, + ) @pytest.mark.parametrize( @@ -9032,13 +9032,10 @@ def test_pearson_corr_empty_columns(): ) @pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) def test_pearson_corr_invalid_column_types(data, gkey): - try: + with pytest.raises( + TypeError, match="Correlation 
accepts only numerical column-pairs", + ): cudf.DataFrame(data).groupby(gkey).corr("pearson") - except RuntimeError as e: - if "Unsupported type-agg combination" in str(e): - raise TypeError( - "Correlation accepts only numerical column-pairs" - ) from e def test_pearson_corr_multiindex_dataframe(): @@ -9046,7 +9043,7 @@ def test_pearson_corr_multiindex_dataframe(): {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]} ).set_index(["a", "b"]) - expected = gdf.groupby(level="a").corr("pearson") - actual = gdf.to_pandas().groupby(level="a").corr("pearson") + actual = gdf.groupby(level="a").corr("pearson") + expected = gdf.to_pandas().groupby(level="a").corr("pearson") assert_eq(expected, actual) From 23bfff7c0bfae1fc12c400cdfe22667351e757d9 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 11 Nov 2021 21:33:29 -0800 Subject: [PATCH 67/79] added corr() aggregation to cudf GroupBy docs --- docs/cudf/source/basics/groupby.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index 04c4d42fa2a..05a631bd312 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -128,6 +128,30 @@ Aggregations on groups is supported via the ``agg`` method: 1 4 1 2.0 2 5 2 4.5 + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + The following table summarizes the available aggregations and the types that support them: @@ -169,6 +193,9 @@ that support them: 
+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | unique | ✅ | ✅ | ✅ | ✅ | | | | | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | corr | ✅ | | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + GroupBy apply ------------- From ee5d30ede17c153b38fd0975d7be4dcee63e78b9 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 11 Nov 2021 21:35:35 -0800 Subject: [PATCH 68/79] fixed copyright years in aggregation.pxd --- python/cudf/cudf/_lib/cpp/aggregation.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 31839ee5fcc..3982b4fecbb 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string From f3b85d1702a57c973b6a03a7126af806a9012867 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Thu, 11 Nov 2021 21:37:39 -0800 Subject: [PATCH 69/79] minor review fixes- list comprehension, rm breakpoints --- python/cudf/cudf/core/groupby/groupby.py | 28 ++++++++++-------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6bc04094d66..39a79f3cb7b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -846,7 +846,6 @@ def corr(self, method="pearson", min_periods=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - # breakpoint() _cols = self.grouping.values.columns.tolist() if self._by: @@ -862,11 +861,11 @@ def corr(self, method="pearson", min_periods=1): {"x": self.obj._data[i[0]], "y": self.obj._data[i[1]]} ).to_struct() new_gb = new_df.groupby(by=self._by, level=self._level) - try: + try: gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) except RuntimeError as e: - if "Unsupported type-agg combination" in str(e): + if "Unsupported groupby reduction type-agg combination" in str(e): raise TypeError( "Correlation accepts only numerical column-pairs" ) from e @@ -874,13 +873,11 @@ def corr(self, method="pearson", min_periods=1): raise # ensure that column-pair labels are arranged in ascending order - cols_list = [] - for i, x in enumerate(_cols): - for j, y in enumerate(_cols): - if i > j: - cols_list.append((_cols[j], _cols[i])) - else: - cols_list.append((_cols[i], _cols[j])) + cols_list = [ + (_cols[j], _cols[i]) if i > j else (_cols[i], _cols[j]) + for j, y in enumerate(_cols) + for i, x in enumerate(_cols) + ] cols_split = [ cols_list[i : i + len(_cols)] for i in range(0, 
len(cols_list), len(_cols)) @@ -895,16 +892,15 @@ def corr(self, method="pearson", min_periods=1): # create a multiindex for the groupby correlated dataframe, # to match pandas behavior - _idx = gb_corr.index.repeat(len(_cols)) - idx_sort_order = _idx._get_sorted_inds() - _idx = _idx._gather(idx_sort_order) - # breakpoint() + unsorted_idx = gb_corr.index.repeat(len(_cols)) + idx_sort_order = unsorted_idx._get_sorted_inds() + sorted_idx = unsorted_idx._gather(idx_sort_order) if len(gb_corr): # TO-DO: Should the operation below be done on the CPU instead? - _idx._data[None] = as_column( + sorted_idx._data[None] = as_column( cudf.Series(_cols).tile(len(gb_corr.index)) ) - res.index = _index_from_data(_idx._data) + res.index = _index_from_data(sorted_idx._data) return res From b22392595766451b626a1e9e07015265b662cd35 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Mon, 15 Nov 2021 13:23:16 -0800 Subject: [PATCH 70/79] apply @isvoid suggestions Co-authored-by: Michael Wang --- python/cudf/cudf/core/groupby/groupby.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 23a9947d440..53f242b5d35 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -845,19 +845,13 @@ def corr(self, method="pearson", min_periods=1): # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) _cols = self.grouping.values.columns.tolist() - if self._by: - new_df = cudf.DataFrame._from_data(self.grouping.keys._data) - new_df._data.multiindex = False - else: - new_df = cudf.DataFrame._from_data( - {}, index=_index_from_data(self.grouping.keys._data) - ) + new_df_data = {} for i in tuple(itertools.combinations_with_replacement(_cols, 2)): - new_df._data[i] = cudf.DataFrame._from_data( + new_df_data[i] = cudf.DataFrame._from_data( {"x": self.obj._data[i[0]], "y": 
self.obj._data[i[1]]} ).to_struct() - new_gb = new_df.groupby(by=self._by, level=self._level) + new_gb = new_gb = cudf.DataFrame._from_data(new_df_data).groupby(by=self.grouping.keys) try: gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) From 20c9273754dbd9680dcf2178ec7e46f871e6b3a2 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 16 Nov 2021 18:35:21 -0800 Subject: [PATCH 71/79] reversed copyright fix in cudf/_lib/aggregation.pxd --- docs/cudf/source/basics/groupby.rst | 31 +++++++---------------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index 05a631bd312..0107b4b27ca 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -127,30 +127,13 @@ Aggregations on groups is supported via the ``agg`` method: a 1 4 1 2.0 2 5 2 4.5 - - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").corr(method="pearson") - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 1.000000 0.385727 0.385727 - val2 0.385727 1.000000 1.000000 - val3 0.385727 1.000000 1.000000 - c val1 1.000000 0.714575 0.714575 - val2 0.714575 1.000000 1.000000 - val3 0.714575 1.000000 1.000000 + >>> df.groupby("a").corr(method="pearson") + b c + a + 1 b 1.000000 0.866025 + c 0.866025 1.000000 + 2 b 1.000000 1.000000 + c 1.000000 1.000000 The following table summarizes the available aggregations and the types that support them: From af08150f037c951c98f9b8f21d8a1c94e1a1d119 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Tue, 16 Nov 2021 18:50:54 -0800 Subject: [PATCH 72/79] use existing dataframe for corr() example --- docs/cudf/source/basics/groupby.rst | 31 +++++++---------------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git 
a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index 05a631bd312..1596743b30f 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -127,30 +127,13 @@ Aggregations on groups is supported via the ``agg`` method: a 1 4 1 2.0 2 5 2 4.5 - - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").corr(method="pearson") - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 1.000000 0.385727 0.385727 - val2 0.385727 1.000000 1.000000 - val3 0.385727 1.000000 1.000000 - c val1 1.000000 0.714575 0.714575 - val2 0.714575 1.000000 1.000000 - val3 0.714575 1.000000 1.000000 + >>> df.groupby("a").corr(method="pearson") + b c + a + 1 b 1.000000 0.866025 + c 0.866025 1.000000 + 2 b 1.000000 1.000000 + c 1.000000 1.000000 The following table summarizes the available aggregations and the types that support them: From df616d0e37b34db02b7f49122bef7f1aeb2120e8 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 17 Nov 2021 11:37:16 -0800 Subject: [PATCH 73/79] noted that corr() is supported with decimals in the cudf docs --- docs/cudf/source/basics/groupby.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index 0107b4b27ca..f3269768025 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -176,7 +176,7 @@ that support them: +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | unique | ✅ | ✅ | ✅ | ✅ | | | | | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | corr | ✅ | | | | | | | | + | corr | ✅ | | | | | | | ✅ | 
+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ From 94f198446be9d061e45835d3f98018d46889d64c Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 17 Nov 2021 11:40:40 -0800 Subject: [PATCH 74/79] reversed copyright year in cudf/_lib/aggregation.pxd --- python/cudf/cudf/_lib/aggregation.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd index 85a729ad2a3..84bcaed1b36 100644 --- a/python/cudf/cudf/_lib/aggregation.pxd +++ b/python/cudf/cudf/_lib/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr From 663a71b9ecf22c08b654428224f8560d5e4f3d61 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Wed, 17 Nov 2021 11:59:19 -0800 Subject: [PATCH 75/79] addressed all reviews for groupby.py --- python/cudf/cudf/core/groupby/groupby.py | 32 ++++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 53f242b5d35..24176ba7321 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -787,10 +787,8 @@ def corr(self, method="pearson", min_periods=1): Parameters ---------- - method: Method of correlation - Pearson: standard correlation coefficient. - Kendall, Spearman correlation and callable method - not yet supported. + method: {"pearson" (default), "kendall", "spearman"} or callable + Currently only the pearson correlation coefficient is supported. 
min_periods: int, optional Minimum number of observations required per pair of columns @@ -846,12 +844,13 @@ def corr(self, method="pearson", min_periods=1): _cols = self.grouping.values.columns.tolist() new_df_data = {} - - for i in tuple(itertools.combinations_with_replacement(_cols, 2)): - new_df_data[i] = cudf.DataFrame._from_data( - {"x": self.obj._data[i[0]], "y": self.obj._data[i[1]]} + for x, y in itertools.combinations_with_replacement(_cols, 2): + new_df_data[(x, y)] = cudf.DataFrame._from_data( + {"x": self.obj._data[x], "y": self.obj._data[y]} ).to_struct() - new_gb = new_gb = cudf.DataFrame._from_data(new_df_data).groupby(by=self.grouping.keys) + new_gb = cudf.DataFrame._from_data(new_df_data).groupby( + by=self.grouping.keys + ) try: gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) @@ -859,9 +858,8 @@ def corr(self, method="pearson", min_periods=1): if "Unsupported groupby reduction type-agg combination" in str(e): raise TypeError( "Correlation accepts only numerical column-pairs" - ) from e - else: - raise + ) + raise # ensure that column-pair labels are arranged in ascending order cols_list = [ @@ -876,10 +874,12 @@ def corr(self, method="pearson", min_periods=1): # interleave: combine the correlation results for each column-pair # into a single column - res = cudf.DataFrame() - for i, x in zip(cols_split, _cols): - ic = gb_corr.loc[:, i].interleave_columns() - res[x] = ic + res = cudf.DataFrame._from_data( + { + x: gb_corr.loc[:, i].interleave_columns() + for i, x in zip(cols_split, _cols) + } + ) # create a multiindex for the groupby correlated dataframe, # to match pandas behavior From 982d79d794c7bd8712fb3a2e16402b718d57e27b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 18 Nov 2021 09:22:09 -0500 Subject: [PATCH 76/79] Update python/cudf/cudf/core/groupby/groupby.py Co-authored-by: Michael Wang --- python/cudf/cudf/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 24176ba7321..607d455ee65 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -863,7 +863,7 @@ def corr(self, method="pearson", min_periods=1): # ensure that column-pair labels are arranged in ascending order cols_list = [ - (_cols[j], _cols[i]) if i > j else (_cols[i], _cols[j]) + (y, x) if i > j else (x, y) for j, y in enumerate(_cols) for i, x in enumerate(_cols) ] From 53465bbbc8c8452969c261b623e125b281889162 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Mon, 29 Nov 2021 16:11:18 -0800 Subject: [PATCH 77/79] addressed Vyas reviews --- python/cudf/cudf/core/groupby/groupby.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0135b28547f..3c6c00d39a3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -787,8 +787,9 @@ def corr(self, method="pearson", min_periods=1): Parameters ---------- - method: {"pearson" (default), "kendall", "spearman"} or callable - Currently only the pearson correlation coefficient is supported. + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. 
min_periods: int, optional Minimum number of observations required per pair of columns @@ -830,10 +831,9 @@ def corr(self, method="pearson", min_periods=1): c val1 1.000000 0.714575 0.714575 val2 0.714575 1.000000 1.000000 val3 0.714575 1.000000 1.000000 - """ - if not method.lower() in ["pearson"]: + if not method.lower() in ("pearson",): raise NotImplementedError( "Only pearson correlation is currently supported" ) @@ -841,7 +841,9 @@ def corr(self, method="pearson", min_periods=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + # breakpoint() _cols = self.grouping.values.columns.tolist() + len_cols = len(_cols) new_df_data = {} for x, y in itertools.combinations_with_replacement(_cols, 2): @@ -868,8 +870,8 @@ def corr(self, method="pearson", min_periods=1): for i, x in enumerate(_cols) ] cols_split = [ - cols_list[i : i + len(_cols)] - for i in range(0, len(cols_list), len(_cols)) + cols_list[i : i + len_cols] + for i in range(0, len(cols_list), len_cols) ] # interleave: combine the correlation results for each column-pair @@ -883,7 +885,7 @@ def corr(self, method="pearson", min_periods=1): # create a multiindex for the groupby correlated dataframe, # to match pandas behavior - unsorted_idx = gb_corr.index.repeat(len(_cols)) + unsorted_idx = gb_corr.index.repeat(len_cols) idx_sort_order = unsorted_idx._get_sorted_inds() sorted_idx = unsorted_idx._gather(idx_sort_order) if len(gb_corr): From f36ab44c5485d1743a56692941c5f12a59d840c2 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Mon, 29 Nov 2021 16:20:15 -0800 Subject: [PATCH 78/79] updated API link with corr in api_docs/groupby.rst --- docs/cudf/source/api_docs/groupby.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index cf08d1d791b..575d7442cdf 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ 
b/docs/cudf/source/api_docs/groupby.rst @@ -59,6 +59,7 @@ Computations / descriptive stats GroupBy.std GroupBy.sum GroupBy.var + GroupBy.corr The following methods are available in both ``SeriesGroupBy`` and ``DataFrameGroupBy`` objects, but may differ slightly, usually in that From 28d0a0a1b0fd86ceb71e91d34ca6b25d513fa625 Mon Sep 17 00:00:00 2001 From: Sheilah Date: Mon, 29 Nov 2021 16:35:12 -0800 Subject: [PATCH 79/79] use MultiIndex._from_data for the groupby corr result index; remove leftover breakpoint comment --- python/cudf/cudf/core/groupby/groupby.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3c6c00d39a3..f1d622362e2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -15,7 +15,7 @@ from cudf.api.types import is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column -from cudf.core.index import _index_from_data +from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin, cached_property @@ -841,7 +841,6 @@ def corr(self, method="pearson", min_periods=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - # breakpoint() _cols = self.grouping.values.columns.tolist() len_cols = len(_cols) @@ -893,7 +892,7 @@ def corr(self, method="pearson", min_periods=1): sorted_idx._data[None] = as_column( cudf.Series(_cols).tile(len(gb_corr.index)) ) - res.index = _index_from_data(sorted_idx._data) + res.index = MultiIndex._from_data(sorted_idx._data) return res