Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add groupby::replace_nulls(replace_policy) api #7118

Merged
merged 39 commits into from
May 24, 2021
Merged
Changes from 1 commit
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
0d677f3
Initial, blocked by thrust 1374
isVoid Jan 11, 2021
a649994
Review comments: redesign to groupby::replace_null
isVoid Jan 12, 2021
0496739
refactor cleanup
isVoid Jan 12, 2021
72acfd3
Add test cases
isVoid Jan 12, 2021
60a2792
Test case fix
isVoid Jan 12, 2021
8115c51
Documentation; code rearrangement
isVoid Jan 14, 2021
604e92e
initial cython bindings
isVoid Jan 14, 2021
e0f8db7
Consolidate functor, improve docstring
isVoid Jan 20, 2021
dbe7482
Use generic iterator type for groupby_replace_nulls
isVoid Jan 20, 2021
207c0d7
Style
isVoid Jan 20, 2021
766af7d
Cython Style Fix
isVoid Jan 20, 2021
91d2419
Add new header to conda recipe
isVoid Jan 20, 2021
1da6c6a
Rev: remove const qual; Early size check
isVoid Jan 25, 2021
a2ded96
Use default rmm_memory_resource for scratch space
isVoid Jan 28, 2021
0aad4cf
Undo stale detail/groupby.hpp change
isVoid Jan 28, 2021
7bed6e7
cpp refactor
isVoid Jan 28, 2021
6cf9166
cython side refactor
isVoid Jan 28, 2021
9607c9d
update missing docstrings
isVoid Jan 29, 2021
97549e9
Merge branch 'branch-0.19' of https://github.com/rapidsai/cudf into i…
isVoid Feb 5, 2021
cec5427
Update interface after discussion
isVoid Feb 5, 2021
0d49c83
Merge branch 'branch-0.19' of https://github.com/rapidsai/cudf into i…
isVoid Feb 9, 2021
85ed523
Update unit test, add test to CMakeList
isVoid Feb 9, 2021
b941c14
Use vector of results
isVoid Feb 9, 2021
a83a185
Removed scan_agg_result
isVoid Feb 10, 2021
e886d5b
Update docstring
isVoid Feb 10, 2021
40a2a4a
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into i…
isVoid May 4, 2021
3a32f7b
initial refactor, signature and docs
isVoid May 4, 2021
c8de3d4
Refactor existing code, accepting multi col
isVoid May 5, 2021
2c784b7
cython refactor
isVoid May 5, 2021
177b9d0
Lists and structs test
isVoid May 5, 2021
66b18f1
Apply suggestions from code review
isVoid May 5, 2021
8f62a55
Update groupby.pxd
isVoid May 5, 2021
a2ea4e5
Update group_replace_nulls.hpp
isVoid May 5, 2021
d86bf91
Update nulls.cuh
isVoid May 5, 2021
7ec84f7
Update to device_span and move def to cu file
isVoid May 6, 2021
7f6f4b7
Merge branch 'branch-0.20' of https://github.com/rapidsai/cudf into i…
isVoid May 6, 2021
5e58d5a
Renames
isVoid May 6, 2021
5780d78
Merge branch 'branch-21.06' of https://github.com/rapidsai/cudf into …
isVoid May 20, 2021
7a90f52
Update replace_nulls_tests.cpp
isVoid May 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Initial, blocked by thrust 1374
isVoid committed Jan 11, 2021
commit 0d677f3e3a56b06c35c7a1c9ccf361d0fbac2c68
3 changes: 3 additions & 0 deletions cpp/include/cudf/groupby.hpp
Original file line number Diff line number Diff line change
@@ -195,6 +195,9 @@ class groupby {
groups get_groups(cudf::table_view values = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Returns the group labels held by the groupby helper as a standalone column.
 *
 * The definition in groupby.cu copies `helper().group_labels()` into a newly
 * created `cudf::size_type` numeric column that has no null mask.
 * NOTE(review): presumably the labels are in the helper's sorted key order,
 * one label per row - confirm against the sort helper.
 *
 * @param mr Device memory resource used to allocate the returned column.
 * @return A new `cudf::size_type` column containing the group labels.
 */
std::unique_ptr<cudf::column> get_group_label(
jrhemstad marked this conversation as resolved.
Show resolved Hide resolved
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

private:
table_view _keys; ///< Keys that determine grouping
null_policy _include_null_keys{null_policy::EXCLUDE}; ///< Include rows in keys
18 changes: 18 additions & 0 deletions cpp/include/cudf/replace.hpp
Original file line number Diff line number Diff line change
@@ -17,6 +17,7 @@
#pragma once

#include <cudf/types.hpp>
#include <cudf/groupby.hpp>
#include <memory>

namespace cudf {
@@ -86,6 +87,23 @@ std::unique_ptr<column> replace_nulls(
replace_policy const& replace_policy,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Replaces nulls in `input`, using `key` to split the rows into independent segments.
 *
 * The implementation reads `key` as `cudf::size_type` values and runs a keyed
 * scan over the rows, so each run of consecutive equal keys is handled on its
 * own; the result is then gathered from `input`. If `input` is empty an empty
 * column like `input` is returned, and if `input` has no nulls a copy of
 * `input` is returned.
 *
 * NOTE(review): presumably a null with no valid candidate inside its own
 * segment stays null - confirm against the scan functor in nulls.cu.
 * NOTE(review): confirm that every `replace_policy` value is handled by the
 * implementation in nulls.cu.
 *
 * @throws cudf::logic_error if `key` and `input` differ in size ("Key-value size mismatch.").
 *
 * @param[in] key Segment labels, one per row of `input`, read as `cudf::size_type`.
 * @param[in] input A column whose null values will be replaced.
 * @param[in] replace_policy Specify the position of replacement values relative to null values.
 * @param[in] mr Device memory resource used to allocate device memory of the returned column.
 *
 * @returns A new column with the rows of `input` after null replacement.
 */
std::unique_ptr<column> replace_nulls(
column_view const& key,
column_view const& input,
replace_policy const& replace_policy,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Replaces all NaN values in a column with corresponding values from another column
*
11 changes: 11 additions & 0 deletions cpp/src/groupby/groupby.cu
Original file line number Diff line number Diff line change
@@ -183,6 +183,17 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re
}
}

/**
 * @brief Materializes the helper's group labels as a `cudf::size_type` column.
 *
 * @param mr Device memory resource used to allocate the returned column.
 * @return A new numeric column (no null mask) holding a copy of the group labels.
 */
std::unique_ptr<cudf::column> groupby::get_group_label(rmm::mr::device_memory_resource* mr)
{
  // Bind by const reference: taking the labels by value would deep-copy the
  // whole label vector only to copy it a second time into the output column.
  auto const& d_group_labels = helper().group_labels();

  auto group_label = cudf::make_numeric_column(cudf::data_type(type_to_id<cudf::size_type>()),
                                               d_group_labels.size(),
                                               mask_state::UNALLOCATED,
                                               rmm::cuda_stream_default,
                                               mr);
  auto group_label_mview = group_label->mutable_view();

  // NOTE(review): this copy has no execution policy and its destination is a
  // raw pointer. If the labels live in a device vector, Thrust may treat the
  // destination as host memory - confirm, and consider passing an explicit
  // device execution policy on the stream used above.
  thrust::copy(
    d_group_labels.begin(), d_group_labels.end(), group_label_mview.begin<cudf::size_type>());
  return group_label;
}

// Get the sort helper object
detail::sort::sort_groupby_helper& groupby::helper()
{
74 changes: 73 additions & 1 deletion cpp/src/replace/nulls.cu
Original file line number Diff line number Diff line change
@@ -39,11 +39,14 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/functional.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/reverse_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/scan.h>

namespace { // anonymous

@@ -387,7 +390,7 @@ std::unique_ptr<cudf::column> replace_nulls_policy_impl(cudf::column_view const&
auto valid_it = cudf::detail::make_validity_iterator(*device_in);
auto in_begin = thrust::make_zip_iterator(thrust::make_tuple(index, valid_it));

rmm::device_vector<cudf::size_type> gather_map(input.size());
rmm::device_uvector<cudf::size_type> gather_map(input.size(), stream, mr);
auto gm_begin = thrust::make_zip_iterator(
thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator()));

@@ -410,6 +413,50 @@ std::unique_ptr<cudf::column> replace_nulls_policy_impl(cudf::column_view const&
return std::move(output->release()[0]);
}

/**
 * @brief Keyed (groupby) variant of the policy-based null replacement.
 *
 * Builds a gather map with a scan-by-key over (row index, validity) pairs and
 * then gathers `input` through it. Rows are segmented by runs of consecutive
 * equal values in `key`, so a replacement is never taken from another segment.
 *
 * Preconditions (checked by the caller): `key.size() == input.size()` and
 * `key` holds `cudf::size_type` values.
 *
 * @param key Segment labels, one per row of `input`.
 * @param input Column whose nulls are replaced.
 * @param replace_policy PRECEDING scans front-to-back; any other value scans back-to-front.
 * @param stream CUDA stream on which the scan and the gather are executed.
 * @param mr Device memory resource used to allocate the returned column.
 * @return A new column gathered from `input` through the computed map.
 */
std::unique_ptr<cudf::column> replace_nulls_policy_impl(cudf::column_view const& key,
                                                        cudf::column_view const& input,
                                                        cudf::replace_policy const& replace_policy,
                                                        rmm::cuda_stream_view stream,
                                                        rmm::mr::device_memory_resource* mr)
{
  auto const size      = input.size();
  auto const key_begin = key.begin<cudf::size_type>();

  auto device_in = cudf::column_device_view::create(input);
  auto index     = thrust::make_counting_iterator<cudf::size_type>(0);
  auto valid_it  = cudf::detail::make_validity_iterator(*device_in);
  auto in_begin  = thrust::make_zip_iterator(thrust::make_tuple(index, valid_it));

  // Scratch space only: it never escapes this function, so it is taken from
  // the default resource rather than from the caller's output resource `mr`.
  rmm::device_uvector<cudf::size_type> gather_map(size, stream);
  // Only the index half of each scanned pair is kept; the validity half is discarded.
  auto gm_begin = thrust::make_zip_iterator(
    thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator()));

  auto func = replace_policy_functor();
  thrust::equal_to<cudf::size_type> eq;
  if (replace_policy == cudf::replace_policy::PRECEDING) {
    thrust::inclusive_scan_by_key(
      rmm::exec_policy(stream), key_begin, key_begin + size, in_begin, gm_begin, eq, func);
  } else {
    // Same scan walked from the last row to the first. Without this branch the
    // gather map would be left uninitialised and then used unchecked below.
    auto key_rbegin = thrust::make_reverse_iterator(key_begin + size);
    auto in_rbegin  = thrust::make_reverse_iterator(in_begin + size);
    auto gm_rbegin  = thrust::make_reverse_iterator(gm_begin + size);
    thrust::inclusive_scan_by_key(
      rmm::exec_policy(stream), key_rbegin, key_rbegin + size, in_rbegin, gm_rbegin, eq, func);
  }

  // Every map entry is a row index produced by the scan, so bounds checking is
  // skipped. The caller's stream and resource are forwarded so the returned
  // column is built on `stream` and allocated from `mr`.
  auto output = cudf::detail::gather(cudf::table_view({input}),
                                     gather_map.begin(),
                                     gather_map.end(),
                                     cudf::out_of_bounds_policy::DONT_CHECK,
                                     stream,
                                     mr);

  return std::move(output->release()[0]);
}

} // end anonymous namespace

namespace cudf {
@@ -457,6 +504,21 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
return replace_nulls_policy_impl(input, replace_policy, stream, mr);
}

/**
 * @brief Validates the arguments of the keyed `replace_nulls` and dispatches to the scan.
 *
 * @throws cudf::logic_error if `key` and `input` differ in size, or if `key`
 * is not a `cudf::size_type` column.
 *
 * @param key Segment labels, one per row of `input`.
 * @param input Column whose nulls are replaced.
 * @param replace_policy Position of replacement values relative to null values.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column.
 * @return An empty column like `input` when `input` is empty, a copy of
 * `input` when it has no nulls, otherwise the null-replaced column.
 */
std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& key,
                                            cudf::column_view const& input,
                                            cudf::replace_policy const& replace_policy,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
{
  // Validate before any early return so that a mismatched key is reported even
  // when `input` happens to be empty.
  CUDF_EXPECTS(input.size() == key.size(), "Key-value size mismatch.");
  // The scan reads the key buffer as `size_type`; any other type would be
  // silently reinterpreted.
  CUDF_EXPECTS(key.type().id() == cudf::type_to_id<cudf::size_type>(),
               "Key column must be of size_type.");

  if (input.is_empty()) { return cudf::empty_like(input); }

  if (!input.has_nulls()) { return std::make_unique<cudf::column>(input, stream, mr); }

  return replace_nulls_policy_impl(key, input, replace_policy, stream, mr);
}

} // namespace detail

std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
@@ -483,4 +545,14 @@ std::unique_ptr<cudf::column> replace_nulls(column_view const& input,
return cudf::detail::replace_nulls(input, replace_policy, rmm::cuda_stream_default, mr);
}

// Public keyed overload of replace_nulls: forwards every argument to
// cudf::detail::replace_nulls, supplying rmm::cuda_stream_default as the stream.
std::unique_ptr<column> replace_nulls(
column_view const& key,
column_view const& input,
replace_policy const& replace_policy,
rmm::mr::device_memory_resource* mr)
{
// NOTE(review): presumably opens a profiling range for this call - confirm against the macro.
CUDF_FUNC_RANGE();
return cudf::detail::replace_nulls(key, input, replace_policy, rmm::cuda_stream_default, mr);
}

} // namespace cudf
41 changes: 41 additions & 0 deletions cpp/tests/replace/replace_nulls_tests.cpp
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@
*/

#include <cudf/replace.hpp>
#include <cudf/groupby.hpp>

#include <tests/groupby/groupby_test_util.hpp>

@@ -437,6 +438,46 @@ TYPED_TEST(ReplaceNullsPolicyTest, FollowingFillTrailingNulls)
cudf::replace_policy::FOLLOWING);
}

// template <typename T>
// struct ReplaceNullsGroupbyTest : public cudf::test::BaseFixture {
// };

// TYPED_TEST_CASE(ReplaceNullsGroupbyTest, test_types);

// template <typename T>
// void TestReplaceNullsGroupby(cudf::test::fixed_width_column_wrapper<T> key,
// cudf::test::fixed_width_column_wrapper<int32_t> input,
// cudf::test::fixed_width_column_wrapper<T> expected,
// cudf::replace_policy policy)
// {
// cudf::groupby::groupby gb_obj(table_view({key}), false, false, {}, {});
// auto labels = gb_obj.get_group_label();

// auto result = cudf::replace_nulls(*labels, input, policy);
// CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected);
// }

// TYPED_TEST(ReplaceNullsGroupbyTest, PrecedingFill)
// {
// using K = TypeParam;
// using V = int32_t;

// std::vector<K> key =
// cudf::test::make_type_param_vector<K>({0, 1, 0, 1, 0, 1});
// std::vector<V> val =
// cudf::test::make_type_param_vector<V>({42, 7, 24, 10, 1, 1000});
// std::vector<cudf::valid_type> mask =
// cudf::test::make_type_param_vector<cudf::valid_type>({1, 1, 1, 0, 0, 0});
// std::vector<K> expect_col =
// cudf::test::make_type_param_vector<K>({42, 7, 24, 7, 24, 7});

// TestReplaceNullsGroupby(
// cudf::test::fixed_width_column_wrapper<K>(key.begin(), key.end(), mask.begin()),
// cudf::test::fixed_width_column_wrapper<K>(
// expect_col.begin(), expect_col.end(), cudf::test::all_valid()),
// cudf::replace_policy::PRECEDING);
// }

// Empty test fixture: adds no members of its own and inherits everything from
// cudf::test::BaseFixture.
// NOTE(review): presumably used by the dictionary replace tests that follow (not visible here).
struct ReplaceDictionaryTest : public cudf::test::BaseFixture {
};