From e0c394273c355f3b706c9aaca6c0e554906ec379 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 10 Mar 2020 23:12:44 +0530 Subject: [PATCH 1/6] Fix issue with MIN/MAX strings when val have nulls --- cpp/include/cudf/detail/replace.hpp | 2 +- cpp/src/groupby/hash/groupby.cu | 20 ++++++++++++++++---- cpp/tests/groupby/sort/group_max_test.cu | 14 ++++++++++++++ cpp/tests/groupby/sort/group_min_test.cu | 14 ++++++++++++++ 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 1818526ff53..413eb1f90ef 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -57,7 +57,7 @@ std::unique_ptr replace_nulls(column_view const& input, * @returns Copy of `input` with null values replaced by `replacement`. */ std::unique_ptr replace_nulls(column_view const& input, - scalar const* replacement, + scalar const& replacement, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), cudaStream_t stream = 0); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index cf281bc76d7..06a206f79b7 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -23,10 +23,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -166,10 +168,20 @@ void sparse_to_dense_results( [&col, to_dense_agg_result, mr, stream] (auto const& agg_kind) { auto tranformed_agg = std::make_unique(agg_kind); - auto argmax_result = to_dense_agg_result(tranformed_agg); - auto transformed_result = experimental::detail::gather( - table_view({col}), *argmax_result, false, false, false, mr, stream); - return std::move(transformed_result->release()[0]); + auto arg_result = to_dense_agg_result(tranformed_agg); + if (arg_result->has_nulls()) { + auto replacement = numeric_scalar(-1, true, stream); + auto null_replaced_map = cudf::detail::replace_nulls( + *arg_result, replacement, rmm::mr::get_default_resource(), stream); + auto transformed_result = experimental::detail::gather( + table_view({col}), *null_replaced_map, false, true, false, mr, stream); + return std::move(transformed_result->release()[0]); + } + else { + auto transformed_result = experimental::detail::gather( + table_view({col}), *arg_result, false, false, false, mr, stream); + return std::move(transformed_result->release()[0]); + } }; for (auto &&agg : agg_v) { diff --git a/cpp/tests/groupby/sort/group_max_test.cu b/cpp/tests/groupby/sort/group_max_test.cu index 77937050422..b540d00bdea 100644 --- a/cpp/tests/groupby/sort/group_max_test.cu +++ b/cpp/tests/groupby/sort/group_max_test.cu @@ -133,5 +133,19 @@ TEST_F(groupby_max_string_test, basic) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } +TEST_F(groupby_max_string_test, zero_valid_values) +{ + using K = int32_t; + + fixed_width_column_wrapper keys { 1, 1, 1}; + strings_column_wrapper vals ( { "año", "bit", "₹1"}, all_null() ); + + fixed_width_column_wrapper expect_keys { 1 }; + strings_column_wrapper expect_vals({ "" }, all_null()); + + auto agg = cudf::experimental::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/sort/group_min_test.cu b/cpp/tests/groupby/sort/group_min_test.cu index abe5f6f006f..c2ae89cb971 100644 --- a/cpp/tests/groupby/sort/group_min_test.cu +++ b/cpp/tests/groupby/sort/group_min_test.cu @@ -133,5 +133,19 @@ TEST_F(groupby_min_string_test, basic) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } +TEST_F(groupby_min_string_test, zero_valid_values) +{ + using K = int32_t; + + fixed_width_column_wrapper keys { 1, 1, 1}; + strings_column_wrapper vals ( { "año", "bit", "₹1"}, all_null() ); + + fixed_width_column_wrapper expect_keys { 1 }; + strings_column_wrapper expect_vals({ "" }, all_null()); + + auto agg = cudf::experimental::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf From 28f92451e6d5e8a83360a7c742f14def492cae20 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 10 Mar 2020 23:23:39 +0530 Subject: [PATCH 2/6] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c660d83500b..0eba669fe2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -221,7 +221,7 @@ - PR #4358 Fix strings::concat where narep is an empty string - PR #4369 Fix race condition in gpuinflate - PR #4390 Disable ScatterValid and ScatterNull legacy tests - +- PR #4398 Fixes the bug in groupby in MIN/MAX on strings when strings some groups are empty # cuDF 0.12.0 (04 Feb 2020) From e21d82106823809bb8d94846e4c4a13def4c4f1b Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 12 Mar 2020 04:29:45 +0530 Subject: [PATCH 3/6] Remove extra replace_nulls operation --- cpp/src/groupby/hash/groupby.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 06a206f79b7..a039000ef2a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -169,12 +169,12 @@ void sparse_to_dense_results( (auto const& agg_kind) { auto tranformed_agg = std::make_unique(agg_kind); auto arg_result = to_dense_agg_result(tranformed_agg); - if (arg_result->has_nulls()) { - auto replacement = numeric_scalar(-1, true, stream); - auto null_replaced_map = cudf::detail::replace_nulls( - *arg_result, replacement, rmm::mr::get_default_resource(), stream); + if (arg_result->nullable()) { + column_view null_removed_map(data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data())); auto transformed_result = experimental::detail::gather( - table_view({col}), *null_replaced_map, false, true, false, mr, stream); + table_view({col}), null_removed_map, false, true, false, mr, stream); return std::move(transformed_result->release()[0]); } else { From f33806c812abfe3b2e0365796ec4a9d8326e7ddf Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 13 Mar 2020 20:27:24 +0530 Subject: [PATCH 4/6] Chnagelog typo --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdc3e7eb71d..2236e701ed9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -234,7 +234,7 @@ - PR #4358 Fix strings::concat where narep is an empty string - PR #4369 Fix race condition in gpuinflate - PR #4390 Disable ScatterValid and ScatterNull legacy tests -- PR #4398 Fixes the bug in groupby in MIN/MAX on strings when strings some groups are empty +- PR #4398 Fixes the failure in groupby in MIN/MAX on strings when some groups are empty - PR #4406 Fix sorted merge issue with null values and ascending=False - PR #4423 Tighten up Dask serialization checks - PR #4434 Fix join_strings logic with all-null strings and non-null narep From f74adf84709922f684f2ce644f9da8b17bc157fe Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 16 Mar 2020 16:39:06 +0530 Subject: [PATCH 5/6] Document dropping mask of ARGMIN/MAX gather map --- cpp/src/groupby/hash/groupby.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index a039000ef2a..ed4daec7f68 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -170,6 +170,11 @@ void sparse_to_dense_results( auto tranformed_agg = std::make_unique(agg_kind); auto arg_result = to_dense_agg_result(tranformed_agg); if (arg_result->nullable()) { + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. column_view null_removed_map(data_type(type_to_id()), arg_result->size(), static_cast(arg_result->view().template data())); From 76bc0e823a8abbf224b2a65862133dcd56298fc4 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 16 Mar 2020 23:35:29 +0530 Subject: [PATCH 6/6] Review code cleanup requested by karthikeyan https://github.com/rapidsai/cudf/pull/4456#discussion_r393186736 --- cpp/src/groupby/hash/groupby.cu | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ed4daec7f68..66e8138d45b 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -167,26 +167,19 @@ void sparse_to_dense_results( auto transformed_result = [&col, to_dense_agg_result, mr, stream] (auto const& agg_kind) { - auto tranformed_agg = std::make_unique(agg_kind); - auto arg_result = to_dense_agg_result(tranformed_agg); - if (arg_result->nullable()) { - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map(data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data())); - auto transformed_result = experimental::detail::gather( - table_view({col}), null_removed_map, false, true, false, mr, stream); - return std::move(transformed_result->release()[0]); - } - else { - auto transformed_result = experimental::detail::gather( - table_view({col}), *arg_result, false, false, false, mr, stream); - return std::move(transformed_result->release()[0]); - } + auto transformed_agg = std::make_unique(agg_kind); + auto arg_result = to_dense_agg_result(transformed_agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map(data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data())); + auto transformed_result = experimental::detail::gather(table_view({col}), + null_removed_map, false, arg_result->nullable(), false, mr, stream); + return std::move(transformed_result->release()[0]); }; for (auto &&agg : agg_v) {