diff --git a/cpp/benchmarks/groupby/group_shift_benchmark.cu b/cpp/benchmarks/groupby/group_shift_benchmark.cu index 6e48fab7220..81afcdd80e1 100644 --- a/cpp/benchmarks/groupby/group_shift_benchmark.cu +++ b/cpp/benchmarks/groupby/group_shift_benchmark.cu @@ -57,8 +57,8 @@ void BM_group_shift(benchmark::State& state) cudf::groupby::groupby gb_obj(cudf::table_view({keys})); - cudf::size_type offset = - static_cast(column_size / float(num_groups) * 0.5); // forward shift half way + std::vector offsets{ + static_cast(column_size / float(num_groups) * 0.5)}; // forward shift half way // null fill value auto fill_value = cudf::make_default_constructed_scalar(cudf::data_type(cudf::type_id::INT64)); // non null fill value @@ -66,7 +66,7 @@ void BM_group_shift(benchmark::State& state) for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = gb_obj.shift(vals, offset, *fill_value); + auto result = gb_obj.shift(cudf::table_view{{vals}}, offsets, {*fill_value}); } } diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 0a08c978715..85c469f58f8 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -228,37 +228,57 @@ class groupby { /** * @brief Performs grouped shifts for specified values. * - * For each group, `i`th element is determined by the `i - offset`th element - * of the group. If `i - offset < 0 or >= group_size`, the value is determined by - * @p fill_value. + * In `j`th column, for each group, `i`th element is determined by the `i - offsets[j]`th + * element of the group. If `i - offsets[j] < 0 or >= group_size`, the value is determined by + * @p fill_values[j]. + * + * @note The first returned table stores the keys passed to the groupby object. Row `i` of the key + * table corresponds to the group labels of row `i` in the shifted columns. The key order in + * each group matches the input order. The order of each group is arbitrary. 
The group order + * in successive calls to `groupby::shift` may be different. * * Example: * @code{.pseudo} - * keys: {1 1 1 1 2 2 2} - * values: {3 1 4 7 9 2 5} - * offset: 2 - * fill_value: @ - * result: {@ @ 3 1 @ @ 9} + * keys: {1 4 1 3 4 4 1} + * {1 2 1 3 2 2 1} + * values: {3 9 1 4 2 5 7} + * {"a" "c" "bb" "ee" "z" "x" "d"} + * offset: {2, -1} + * fill_value: {@, @} + * result (group order may be different): + * keys: {3 1 1 1 4 4 4} + * {3 1 1 1 2 2 2} + * values: {@ @ @ 3 @ @ 9} + * {@ "bb" "d" @ "z" "x" @} + * * ------------------------------------------------- - * keys: {1 1 1 1 2 2 2} - * values: {3 1 4 7 9 2 5} - * offset: -2 - * fill_value: -1 - * result: {4 7 -1 -1 5 -1 -1} + * keys: {1 4 1 3 4 4 1} + * {1 2 1 3 2 2 1} + * values: {3 9 1 4 2 5 7} + * {"a" "c" "bb" "ee" "z" "x" "d"} + * offset: {-2, 1} + * fill_value: {-1, "42"} + * result (group order may be different): + * keys: {3 1 1 1 4 4 4} + * {3 1 1 1 2 2 2} + * values: {-1 7 -1 -1 5 -1 -1} + * {"42" "42" "a" "bb" "42" "c" "z"} + * * @endcode * - * @param values Column to be shifted - * @param offset The off set by which to shift the input - * @param fill_value Fill value for indeterminable outputs + * @param values Table whose columns are to be shifted + * @param offsets The offsets by which to shift the input + * @param fill_values Fill values for indeterminable outputs * @param mr Device memory resource used to allocate the returned table and columns' device memory - * @return Pair containing the table with each group's key and the column shifted + * @return Pair containing the tables with each group's key and the columns shifted * - * @throws cudf::logic_error if @p fill_value dtype does not match @p input dtype + * @throws cudf::logic_error if @p fill_values[i] dtype does not match @p values[i] dtype for + * `i`th column */ - std::pair, std::unique_ptr> shift( - column_view const& values, - size_type offset, - scalar const& fill_value, + std::pair, std::unique_ptr> shift( + table_view const& 
values, + host_span offsets, + std::vector> const& fill_values, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 72fbabe100c..b265e1c3112 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -262,23 +263,36 @@ detail::sort::sort_groupby_helper& groupby::helper() return *_helper; }; -std::pair, std::unique_ptr> groupby::shift( - column_view const& values, - size_type offset, - scalar const& fill_value, +std::pair, std::unique_ptr
> groupby::shift( + table_view const& values, + host_span offsets, + std::vector> const& fill_values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(values.type() == fill_value.type(), - "values and fill_value should have the same type."); + CUDF_EXPECTS(values.num_columns() == static_cast(fill_values.size()), + "Mismatch number of fill_values and columns."); + CUDF_EXPECTS( + std::all_of(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.num_columns()), + [&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }), + "values and fill_value should have the same type."); - auto stream = rmm::cuda_stream_default; - auto grouped_values = helper().grouped_values(values, stream); + auto stream = rmm::cuda_stream_default; + std::vector> results; + auto const& group_offsets = helper().group_offsets(stream); + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.num_columns()), + std::back_inserter(results), + [&](size_type i) { + auto grouped_values = helper().grouped_values(values.column(i), stream); + return cudf::detail::segmented_shift( + grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr); + }); - return std::make_pair( - helper().sorted_keys(stream, mr), - std::move(cudf::detail::segmented_shift( - grouped_values->view(), helper().group_offsets(stream), offset, fill_value, stream, mr))); + return std::make_pair(helper().sorted_keys(stream, mr), + std::make_unique(std::move(results))); } } // namespace groupby diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp index 5974a9408b2..3a934071427 100644 --- a/cpp/tests/groupby/shift_tests.cpp +++ b/cpp/tests/groupby/shift_tests.cpp @@ -16,8 +16,10 @@ #include #include +#include #include +#include #include #include #include @@ -33,15 +35,16 @@ struct groupby_shift_fixed_width_test : public BaseFixture { TYPED_TEST_CASE(groupby_shift_fixed_width_test, 
FixedWidthTypes); template -void test_groupby_shift_fixed_width(fixed_width_column_wrapper const& key, - fixed_width_column_wrapper const& value, - size_type offset, - scalar const& fill_value, - fixed_width_column_wrapper const& expected) +void test_groupby_shift_fixed_width_single(fixed_width_column_wrapper const& key, + fixed_width_column_wrapper const& value, + size_type offset, + scalar const& fill_value, + fixed_width_column_wrapper const& expected) { groupby::groupby gb_obj(table_view({key})); - auto got = gb_obj.shift(value, offset, fill_value); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got.second, expected); + std::vector offsets{offset}; + auto got = gb_obj.shift(table_view{{value}}, offsets, {fill_value}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL((*got.second).view().column(0), expected); } TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithoutNull_NullScalar) @@ -54,7 +57,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithoutNull_NullScalar) size_type offset = 2; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_fixed_width(key, val, offset, *slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, *slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithNull_NullScalar) @@ -67,7 +70,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithNull_NullScalar) size_type offset = 2; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_fixed_width(key, val, offset, *slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, *slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithoutNull_ValidScalar) @@ -80,7 +83,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithoutNull_ValidScalar) size_type offset = 3; auto slr = cudf::scalar_type_t(make_type_param_scalar(42), true); - test_groupby_shift_fixed_width(key, val, offset, slr, expected); + 
test_groupby_shift_fixed_width_single(key, val, offset, slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithNull_ValidScalar) @@ -95,7 +98,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, ForwardShiftWithNull_ValidScalar) size_type offset = 3; auto slr = cudf::scalar_type_t(make_type_param_scalar(42), true); - test_groupby_shift_fixed_width(key, val, offset, slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithoutNull_NullScalar) @@ -108,7 +111,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithoutNull_NullScalar) size_type offset = -1; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_fixed_width(key, val, offset, *slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, *slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithNull_NullScalar) @@ -121,7 +124,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithNull_NullScalar) size_type offset = -1; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_fixed_width(key, val, offset, *slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, *slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithoutNull_ValidScalar) @@ -134,7 +137,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithoutNull_ValidScalar) size_type offset = -5; auto slr = cudf::scalar_type_t(make_type_param_scalar(42), true); - test_groupby_shift_fixed_width(key, val, offset, slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithNull_ValidScalar) @@ -149,7 +152,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, BackwardShiftWithNull_ValidScalar) size_type offset = -1; auto slr = cudf::scalar_type_t(make_type_param_scalar(42), true); - 
test_groupby_shift_fixed_width(key, val, offset, slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, ZeroShiftNullScalar) @@ -162,7 +165,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, ZeroShiftNullScalar) size_type offset = 0; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_fixed_width(key, val, offset, *slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, *slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, ZeroShiftValidScalar) @@ -175,7 +178,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, ZeroShiftValidScalar) size_type offset = 0; auto slr = cudf::scalar_type_t(make_type_param_scalar(42), true); - test_groupby_shift_fixed_width(key, val, offset, slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, VeryLargeForwardOffset) @@ -189,7 +192,7 @@ TYPED_TEST(groupby_shift_fixed_width_test, VeryLargeForwardOffset) size_type offset = 1024; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_fixed_width(key, val, offset, *slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, *slr, expected); } TYPED_TEST(groupby_shift_fixed_width_test, VeryLargeBackwardOffset) @@ -203,21 +206,22 @@ TYPED_TEST(groupby_shift_fixed_width_test, VeryLargeBackwardOffset) size_type offset = -1024; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_fixed_width(key, val, offset, *slr, expected); + test_groupby_shift_fixed_width_single(key, val, offset, *slr, expected); } struct groupby_shift_string_test : public BaseFixture { }; -void test_groupby_shift_string(fixed_width_column_wrapper const& key, - strings_column_wrapper const& value, - size_type offset, - scalar const& fill_value, - strings_column_wrapper const& expected) +void 
test_groupby_shift_string_single(fixed_width_column_wrapper const& key, + strings_column_wrapper const& value, + size_type offset, + scalar const& fill_value, + strings_column_wrapper const& expected) { groupby::groupby gb_obj(table_view({key})); - auto got = gb_obj.shift(value, offset, fill_value); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got.second, expected); + std::vector offsets{offset}; + auto got = gb_obj.shift(table_view{{value}}, offsets, {fill_value}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL((*got.second).view().column(0), expected); } TEST_F(groupby_shift_string_test, ForwardShiftWithoutNull_NullScalar) @@ -228,7 +232,7 @@ TEST_F(groupby_shift_string_test, ForwardShiftWithoutNull_NullScalar) size_type offset = 1; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, ForwardShiftWithNull_NullScalar) @@ -239,7 +243,7 @@ TEST_F(groupby_shift_string_test, ForwardShiftWithNull_NullScalar) size_type offset = 2; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, ForwardShiftWithoutNull_ValidScalar) @@ -251,7 +255,7 @@ TEST_F(groupby_shift_string_test, ForwardShiftWithoutNull_ValidScalar) size_type offset = 2; auto slr = cudf::make_string_scalar("42"); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, ForwardShiftWithNull_ValidScalar) @@ -263,7 +267,7 @@ TEST_F(groupby_shift_string_test, ForwardShiftWithNull_ValidScalar) size_type offset = 1; auto slr = cudf::make_string_scalar("42"); - test_groupby_shift_string(key, val, offset, *slr, expected); + 
test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, BackwardShiftWithoutNull_NullScalar) @@ -275,7 +279,7 @@ TEST_F(groupby_shift_string_test, BackwardShiftWithoutNull_NullScalar) size_type offset = -3; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, BackwardShiftWithNull_NullScalar) @@ -287,7 +291,7 @@ TEST_F(groupby_shift_string_test, BackwardShiftWithNull_NullScalar) size_type offset = -1; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, BackwardShiftWithoutNull_ValidScalar) @@ -299,7 +303,7 @@ TEST_F(groupby_shift_string_test, BackwardShiftWithoutNull_ValidScalar) size_type offset = -4; auto slr = cudf::make_string_scalar("42"); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, BackwardShiftWithNull_ValidScalar) @@ -311,7 +315,7 @@ TEST_F(groupby_shift_string_test, BackwardShiftWithNull_ValidScalar) size_type offset = -2; auto slr = cudf::make_string_scalar("42"); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, ZeroShiftNullScalar) @@ -323,7 +327,7 @@ TEST_F(groupby_shift_string_test, ZeroShiftNullScalar) size_type offset = 0; auto slr = cudf::make_default_constructed_scalar(column_view(val).type()); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, ZeroShiftValidScalar) @@ 
-335,7 +339,7 @@ TEST_F(groupby_shift_string_test, ZeroShiftValidScalar) size_type offset = 0; auto slr = cudf::make_string_scalar("42"); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, VeryLargeForwardOffset) @@ -347,7 +351,7 @@ TEST_F(groupby_shift_string_test, VeryLargeForwardOffset) size_type offset = 1024; auto slr = cudf::make_string_scalar("42"); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); } TEST_F(groupby_shift_string_test, VeryLargeBackwardOffset) @@ -359,7 +363,116 @@ TEST_F(groupby_shift_string_test, VeryLargeBackwardOffset) size_type offset = -1024; auto slr = cudf::make_string_scalar("42"); - test_groupby_shift_string(key, val, offset, *slr, expected); + test_groupby_shift_string_single(key, val, offset, *slr, expected); +} + +template +struct groupby_shift_mixed_test : public BaseFixture { +}; + +TYPED_TEST_CASE(groupby_shift_mixed_test, FixedWidthTypes); + +void test_groupby_shift_multi(fixed_width_column_wrapper const& key, + table_view const& value, + std::vector offsets, + std::vector> fill_values, + table_view const& expected) +{ + groupby::groupby gb_obj(table_view({key})); + auto got = gb_obj.shift(value, offsets, fill_values); + CUDF_TEST_EXPECT_TABLES_EQUAL((*got.second).view(), expected); +} + +TYPED_TEST(groupby_shift_mixed_test, NoFill) +{ + fixed_width_column_wrapper key{1, 2, 1, 2, 2, 1, 1}; + strings_column_wrapper v1{"a", "bb", "cc", "d", "eee", "f", "gg"}; + fixed_width_column_wrapper v2{1, 2, 3, 4, 5, 6, 7}; + table_view value{{v1, v2}}; + + strings_column_wrapper e1({"", "", "a", "cc", "", "", "bb"}, {0, 0, 1, 1, 0, 0, 1}); + fixed_width_column_wrapper e2({-1, 1, 3, 6, -1, 2, 4}, {0, 1, 1, 1, 0, 1, 1}); + table_view expected{{e1, e2}}; + + std::vector offset{2, 1}; + auto slr1 = 
cudf::make_default_constructed_scalar(column_view(v1).type()); + auto slr2 = cudf::make_default_constructed_scalar(column_view(v2).type()); + std::vector> fill_values{*slr1, *slr2}; + + test_groupby_shift_multi(key, value, offset, fill_values, expected); +} + +TYPED_TEST(groupby_shift_mixed_test, Fill) +{ + fixed_width_column_wrapper key{1, 2, 1, 2, 2, 1, 1}; + strings_column_wrapper v1{"a", "bb", "cc", "d", "eee", "f", "gg"}; + fixed_width_column_wrapper v2{1, 2, 3, 4, 5, 6, 7}; + table_view value{{v1, v2}}; + + strings_column_wrapper e1({"cc", "f", "gg", "42", "d", "eee", "42"}); + fixed_width_column_wrapper e2({6, 7, 42, 42, 5, 42, 42}); + table_view expected{{e1, e2}}; + + std::vector offset{-1, -2}; + + auto slr1 = cudf::make_string_scalar("42"); + auto slr2 = cudf::scalar_type_t(make_type_param_scalar(42), true); + std::vector> fill_values{*slr1, slr2}; + + test_groupby_shift_multi(key, value, offset, fill_values, expected); +} + +struct groupby_shift_fixed_point_type_test : public BaseFixture { +}; + +TEST_F(groupby_shift_fixed_point_type_test, Matching) +{ + fixed_width_column_wrapper key{2, 3, 4, 4, 3, 2, 2, 4}; + fixed_point_column_wrapper v1{{10, 10, 40, 40, 20, 20, 30, 40}, numeric::scale_type{-1}}; + fixed_point_column_wrapper v2{{5, 5, 8, 8, 6, 7, 9, 7}, numeric::scale_type{3}}; + table_view value{{v1, v2}}; + + std::vector offset{-3, 1}; + auto slr1 = make_fixed_point_scalar(-42, numeric::scale_type{-1}); + auto slr2 = make_fixed_point_scalar(42, numeric::scale_type{3}); + std::vector> fill_values{*slr1, *slr2}; + + fixed_point_column_wrapper e1{{-42, -42, -42, -42, -42, -42, -42, -42}, + numeric::scale_type{-1}}; + fixed_point_column_wrapper e2{{42, 5, 7, 42, 5, 42, 8, 8}, numeric::scale_type{3}}; + table_view expected{{e1, e2}}; + + test_groupby_shift_multi(key, value, offset, fill_values, expected); +} + +TEST_F(groupby_shift_fixed_point_type_test, MismatchScaleType) +{ + fixed_width_column_wrapper key{2, 3, 4, 4, 3, 2, 2, 4}; + 
fixed_point_column_wrapper v1{{10, 10, 40, 40, 20, 20, 30, 40}, numeric::scale_type{-1}}; + + std::vector offset{-3}; + auto slr1 = make_fixed_point_scalar(-42, numeric::scale_type{-4}); + + fixed_point_column_wrapper stub{{-42, -42, -42, -42, -42, -42, -42, -42}, + numeric::scale_type{-1}}; + + EXPECT_THROW(test_groupby_shift_multi(key, table_view{{v1}}, offset, {*slr1}, table_view{{stub}}), + logic_error); +} + +TEST_F(groupby_shift_fixed_point_type_test, MismatchRepType) +{ + fixed_width_column_wrapper key{2, 3, 4, 4, 3, 2, 2, 4}; + fixed_point_column_wrapper v1{{10, 10, 40, 40, 20, 20, 30, 40}, numeric::scale_type{-1}}; + + std::vector offset{-3}; + auto slr1 = make_fixed_point_scalar(-42, numeric::scale_type{-1}); + + fixed_point_column_wrapper stub{{-42, -42, -42, -42, -42, -42, -42, -42}, + numeric::scale_type{-1}}; + + EXPECT_THROW(test_groupby_shift_multi(key, table_view{{v1}}, offset, {*slr1}, table_view{{stub}}), + logic_error); } } // namespace test diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index f3bad855725..af09b27d916 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -5,15 +5,20 @@ from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp cimport bool +from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.aggregation cimport aggregation +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type, order, null_order, null_policy from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.utilities.host_span cimport host_span +# workaround for https://github.com/cython/cython/issues/3885 +ctypedef const scalar constscalar + cdef extern from "cudf/groupby.hpp" \ 
namespace "cudf::groupby" nogil: @@ -74,6 +79,15 @@ cdef extern from "cudf/groupby.hpp" \ const vector[aggregation_request]& requests, ) except + + pair[ + unique_ptr[table], + unique_ptr[table] + ] shift( + const table_view values, + const vector[size_type] offset, + const vector[reference_wrapper[constscalar]] fill_values + ) except + + groups get_groups() except + groups get_groups(table_view values) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 1979ddf8f0c..17f6991c25d 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -22,8 +22,13 @@ from libcpp cimport bool from cudf._lib.column cimport Column from cudf._lib.table cimport Table +from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.scalar import as_device_scalar from cudf._lib.aggregation cimport Aggregation, make_aggregation +from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.scalar.scalar cimport scalar +from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table, table_view @@ -45,6 +50,8 @@ _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} +# workaround for https://github.com/cython/cython/issues/3885 +ctypedef const scalar constscalar cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj @@ -207,6 +214,40 @@ cdef class GroupBy: return Table(data=result_data, index=grouped_keys) + def shift(self, Table values, int periods, list fill_values): + cdef table_view view = values.view() + cdef size_type num_col = view.num_columns() + cdef vector[size_type] offsets = vector[size_type](num_col, periods) + + cdef vector[reference_wrapper[constscalar]] c_fill_values + cdef DeviceScalar d_slr + d_slrs = [] + c_fill_values.reserve(num_col) + for val, col in zip(fill_values, 
values._columns): + d_slr = as_device_scalar(val, dtype=col.dtype) + d_slrs.append(d_slr) + c_fill_values.push_back( + reference_wrapper[constscalar](d_slr.get_raw_ptr()[0]) + ) + + cdef pair[unique_ptr[table], unique_ptr[table]] c_result + + with nogil: + c_result = move( + self.c_obj.get()[0].shift(view, offsets, c_fill_values) + ) + + grouped_keys = Table.from_unique_ptr( + move(c_result.first), + column_names=self.keys._column_names + ) + + shifted = Table.from_unique_ptr( + move(c_result.second), column_names=values._column_names + ) + + return Table(data=shifted._data, index=grouped_keys) + def replace_nulls(self, Table values, object method): cdef table_view val_view = values.view() cdef pair[unique_ptr[table], unique_ptr[table]] c_result diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index c1060d5f505..6a298df32d6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,9 @@ import cudf from cudf._lib import groupby as libgroupby +from cudf._lib.table import Table from cudf.core.abc import Serializable +from cudf.utils.dtypes import is_list_like from cudf.utils.utils import GetAttrGetItemMixin, cached_property @@ -703,6 +705,61 @@ def cummax(self): """Get the column-wise cumulative maximum value in each group.""" return self.agg("cummax") + def shift(self, periods=1, freq=None, axis=0, fill_value=None): + """ + Shift each group by ``periods`` positions. + + Parameters + ---------- + periods : int, default 1 + Number of periods to shift. + freq : str, unsupported + axis : 0, axis to shift + Shift direction. Only row-wise shift is supported + fill_value : scalar or list of scalars, optional + The scalar value to use for newly introduced missing values. Can be + specified with `None`, a single value or multiple values: + + - `None` (default): sets all indeterminable values to null. + - Single value: fill all shifted columns with this value. 
Should + match the data type of all columns. + - List of values: fill shifted columns with corresponding value in + the list. The length of the list should match the number of + columns shifted. Each value should match the data type of the + column to fill. + + Returns + ------- + Series or DataFrame + Object shifted within each group. + + Notes + ----- + Parameter ``freq`` is unsupported. + """ + + if freq is not None: + raise NotImplementedError("Parameter freq is unsupported.") + + if not axis == 0: + raise NotImplementedError("Only axis=0 is supported.") + + value_column_names = [ + x for x in self.obj._column_names if x not in self.grouping.names + ] + num_columns_to_shift = len(value_column_names) + if is_list_like(fill_value): + if not len(fill_value) == num_columns_to_shift: + raise ValueError( + "Mismatched number of columns and values to fill." + ) + else: + fill_value = [fill_value] * num_columns_to_shift + + value_columns = self.obj._data.select_by_label(value_column_names) + result = self._groupby.shift(Table(value_columns), periods, fill_value) + return self.obj.__class__._from_table(result) + class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) diff --git a/python/cudf/cudf/tests/dataset_generator.py b/python/cudf/cudf/tests/dataset_generator.py index d7adf175f3f..5e03068f818 100644 --- a/python/cudf/cudf/tests/dataset_generator.py +++ b/python/cudf/cudf/tests/dataset_generator.py @@ -282,7 +282,9 @@ def get_dataframe(parameters, use_threads): return tbl -def rand_dataframe(dtypes_meta, rows, seed=random.randint(0, 2 ** 32 - 1)): +def rand_dataframe( + dtypes_meta, rows, seed=random.randint(0, 2 ** 32 - 1), use_threads=True +): """ Generates a random table. @@ -300,6 +302,8 @@ def rand_dataframe(dtypes_meta, rows, seed=random.randint(0, 2 ** 32 - 1)): seed : int Specifies the `seed` value to be utilized by all downstream random data generation APIs. 
+ use_threads : bool + Indicates whether to use thread pools to build the columns Returns ------- @@ -457,7 +461,7 @@ def rand_dataframe(dtypes_meta, rows, seed=random.randint(0, 2 ** 32 - 1)): df = get_dataframe( Parameters(num_rows=rows, column_parameters=column_params, seed=seed,), - use_threads=True, + use_threads=use_threads, ) return df diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 2430b0da5ef..e774bda4914 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -15,6 +15,7 @@ import cudf from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 +from cudf.tests.dataset_generator import rand_dataframe from cudf.tests.utils import ( DATETIME_TYPES, SIGNED_TYPES, @@ -1696,3 +1697,207 @@ def test_groupby_mix_agg_scan(): gb.agg(func[1:]) with pytest.raises(NotImplementedError, match=err_msg): gb.agg(func) + + +@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +@pytest.mark.parametrize("fill_value", [None, np.nan, 42]) +def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): + pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["x", "y"]).shift( + periods=n_shift, fill_value=fill_value + ) + got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) + + # Pandas returns shifted column in original row order. We set its index + # to be the key columns, so that `assert_groupby_results_equal` can sort + # rows by key columns to make sure cudf and pandas results match. 
+ expected.index = pd.MultiIndex.from_frame(gdf[["x", "y"]].to_pandas()) + assert_groupby_results_equal( + expected[["val", "val2"]], got[["val", "val2"]] + ) + + +@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +@pytest.mark.parametrize("fill_value", [None, 0, 42]) +def test_groupby_shift_row_mixed_numerics( + nelem, shift_perc, direction, fill_value +): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) + got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) + + # Pandas returns shifted column in original row order. We set its index + # to be the key columns, so that `assert_groupby_results_equal` can sort + # rows by key columns to make sure cudf and pandas results match. + expected.index = gdf["0"].to_pandas() + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + ) + + +# TODO: Shifting list columns is currently unsupported because we cannot +# construct a null list scalar in python. Support once it is added. 
+@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +def test_groupby_shift_row_mixed(nelem, shift_perc, direction): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["0"]).shift(periods=n_shift) + got = gdf.groupby(["0"]).shift(periods=n_shift) + + # Pandas returns shifted column in original row order. We set its index + # to be the key columns, so that `assert_groupby_results_equal` can sort + # rows by key columns to make sure cudf and pandas results match. 
+ expected.index = gdf["0"].to_pandas() + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + ) + + +@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +@pytest.mark.parametrize( + "fill_value", + [ + [ + 42, + "fill", + np.datetime64(123, "ns"), + cudf.Scalar(456, dtype="timedelta64[ns]"), + ] + ], +) +def test_groupby_shift_row_mixed_fill( + nelem, shift_perc, direction, fill_value +): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + # Pandas does not support specifying different fill_value by column, so we + # simulate it column by column + expected = pdf.copy() + for col, single_fill in zip(pdf.iloc[:, 1:], fill_value): + if isinstance(single_fill, cudf.Scalar): + single_fill = single_fill._host_value + expected[col] = ( + pdf[col] + .groupby(pdf["0"]) + .shift(periods=n_shift, fill_value=single_fill) + ) + + got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) + + # Pandas returns shifted column in original row order. We set its index + # to be the key columns, so that `assert_groupby_results_equal` can sort + # rows by key columns to make sure cudf and pandas results match. 
+ expected.index = gdf["0"].to_pandas() + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + ) + + +@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) +@pytest.mark.parametrize("fill_value", [None, 0, 42]) +def test_groupby_shift_row_zero_shift(nelem, fill_value): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + ) + gdf = cudf.from_pandas(t.to_pandas()) + + expected = gdf + got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value) + + # Here, the result should be the same as input due to 0-shift, only the + # key orders are different. + expected = expected.set_index("0") + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + )