Skip to content

Commit

Permalink
Merge pull request #4398 from devavret/bug-groupby-string-min-max
Browse files Browse the repository at this point in the history
[REVIEW] Fix failure in groupby MIN/MAX when strings in some groups are empty
  • Loading branch information
Keith Kraus authored Mar 16, 2020
2 parents 0242d70 + 76bc0e8 commit 9c429e9
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@
- PR #4358 Fix strings::concat where narep is an empty string
- PR #4369 Fix race condition in gpuinflate
- PR #4390 Disable ScatterValid and ScatterNull legacy tests
- PR #4398 Fixes the failure in groupby in MIN/MAX on strings when some groups are empty
- PR #4406 Fix sorted merge issue with null values and ascending=False
- PR #4445 Fix string issue for parquet reader and support `keep_index` for `scatter_to_tables`
- PR #4423 Tighten up Dask serialization checks
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/replace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ std::unique_ptr<column> replace_nulls(column_view const& input,
* @returns Copy of `input` with null values replaced by `replacement`.
*/
std::unique_ptr<column> replace_nulls(column_view const& input,
scalar const* replacement,
scalar const& replacement,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
cudaStream_t stream = 0);

Expand Down
18 changes: 14 additions & 4 deletions cpp/src/groupby/hash/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/groupby.hpp>
#include <cudf/detail/groupby.hpp>
#include <cudf/detail/gather.cuh>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/replace.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/table/table_device_view.cuh>
Expand Down Expand Up @@ -165,10 +167,18 @@ void sparse_to_dense_results(
auto transformed_result =
[&col, to_dense_agg_result, mr, stream]
(auto const& agg_kind) {
auto tranformed_agg = std::make_unique<aggregation>(agg_kind);
auto argmax_result = to_dense_agg_result(tranformed_agg);
auto transformed_result = experimental::detail::gather(
table_view({col}), *argmax_result, false, false, false, mr, stream);
auto transformed_agg = std::make_unique<aggregation>(agg_kind);
auto arg_result = to_dense_agg_result(transformed_agg);
// We make a view of ARG(MIN/MAX) result without a null mask and gather
// using this map. The values in data buffer of ARG(MIN/MAX) result
// corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL
// which is an out of bounds index value (-1) and causes the gathered
// value to be null.
column_view null_removed_map(data_type(type_to_id<size_type>()),
arg_result->size(),
static_cast<void const*>(arg_result->view().template data<size_type>()));
auto transformed_result = experimental::detail::gather(table_view({col}),
null_removed_map, false, arg_result->nullable(), false, mr, stream);
return std::move(transformed_result->release()[0]);
};

Expand Down
14 changes: 14 additions & 0 deletions cpp/tests/groupby/sort/group_max_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,19 @@ TEST_F(groupby_max_string_test, basic)
test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
}

TEST_F(groupby_max_string_test, zero_valid_values)
{
using K = int32_t;

fixed_width_column_wrapper<K> keys { 1, 1, 1};
strings_column_wrapper vals ( { "año", "bit", "₹1"}, all_null() );

fixed_width_column_wrapper<K> expect_keys { 1 };
strings_column_wrapper expect_vals({ "" }, all_null());

auto agg = cudf::experimental::make_max_aggregation();
test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
}

} // namespace test
} // namespace cudf
14 changes: 14 additions & 0 deletions cpp/tests/groupby/sort/group_min_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,19 @@ TEST_F(groupby_min_string_test, basic)
test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
}

TEST_F(groupby_min_string_test, zero_valid_values)
{
using K = int32_t;

fixed_width_column_wrapper<K> keys { 1, 1, 1};
strings_column_wrapper vals ( { "año", "bit", "₹1"}, all_null() );

fixed_width_column_wrapper<K> expect_keys { 1 };
strings_column_wrapper expect_vals({ "" }, all_null());

auto agg = cudf::experimental::make_min_aggregation();
test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
}

} // namespace test
} // namespace cudf

0 comments on commit 9c429e9

Please sign in to comment.