Skip to content

Commit

Permalink
Groupby.shift c++ API refactor and python binding (#8131)
Browse files Browse the repository at this point in the history
Closes #7183, follow-up of #7910

This PR:
- refactors existing libcudf `groupby::shift` API, which only takes a single column, to accept multiple columns.
- adds cython and python bindings for `groupby.shift`. Example python usage:

```
df = cudf.DataFrame({"a":[1,2,1,2,2], "b":["x", "y", "z", "42", "7"]})
>>> df.groupby("a").shift(1)
      b
a      
1  <NA>
1     x
2  <NA>
2     y
2    42
```

Minor refactors:
- adds `use_thread` parameter to `dataset_generator.rand_dataframe` to expose thread pool config.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Robert Maynard (https://github.com/robertmaynard)
  - Ashwin Srinath (https://github.com/shwina)
  - Keith Kraus (https://github.com/kkraus14)
  - Karthikeyan (https://github.com/karthikeyann)
  - Christopher Harris (https://github.com/cwharris)

URL: #8131
  • Loading branch information
isVoid authored May 26, 2021
1 parent 24e05a0 commit cd7fe6f
Show file tree
Hide file tree
Showing 9 changed files with 545 additions and 77 deletions.
6 changes: 3 additions & 3 deletions cpp/benchmarks/groupby/group_shift_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,16 @@ void BM_group_shift(benchmark::State& state)

cudf::groupby::groupby gb_obj(cudf::table_view({keys}));

cudf::size_type offset =
static_cast<cudf::size_type>(column_size / float(num_groups) * 0.5); // forward shift half way
std::vector<cudf::size_type> offsets{
static_cast<cudf::size_type>(column_size / float(num_groups) * 0.5)}; // forward shift half way
// null fill value
auto fill_value = cudf::make_default_constructed_scalar(cudf::data_type(cudf::type_id::INT64));
// non null fill value
// auto fill_value = cudf::make_fixed_width_scalar(static_cast<int64_t>(42));

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = gb_obj.shift(vals, offset, *fill_value);
auto result = gb_obj.shift(cudf::table_view{{vals}}, offsets, {*fill_value});
}
}

Expand Down
64 changes: 42 additions & 22 deletions cpp/include/cudf/groupby.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,37 +228,57 @@ class groupby {
/**
* @brief Performs grouped shifts for specified values.
*
* For each group, `i`th element is determined by the `i - offset`th element
* of the group. If `i - offset < 0 or >= group_size`, the value is determined by
* @p fill_value.
* In `j`th column, for each group, `i`th element is determined by the `i - offsets[j]`th
* element of the group. If `i - offsets[j] < 0 or >= group_size`, the value is determined by
* @p fill_values[j].
*
* @note The first returned table stores the keys passed to the groupby object. Row `i` of the key
* table corresponds to the group labels of row `i` in the shifted columns. The key order in
* each group matches the input order. The order of each group is arbitrary. The group order
* in successive calls to `groupby::shift` may be different.
*
* Example:
* @code{.pseudo}
* keys: {1 1 1 1 2 2 2}
* values: {3 1 4 7 9 2 5}
* offset: 2
* fill_value: @
* result: {@ @ 3 1 @ @ 9}
* keys: {1 4 1 3 4 4 1}
* {1 2 1 3 2 2 1}
* values: {3 9 1 4 2 5 7}
* {"a" "c" "bb" "ee" "z" "x" "d"}
* offset: {2, -1}
* fill_value: {@, @}
* result (group order may be different):
* keys: {3 1 1 1 4 4 4}
* {3 1 1 1 2 2 2}
* values: {@ @ @ 3 @ @ 9}
* {@ "bb" "d" @ "z" "x" @}
*
* -------------------------------------------------
* keys: {1 1 1 1 2 2 2}
* values: {3 1 4 7 9 2 5}
* offset: -2
* fill_value: -1
* result: {4 7 -1 -1 5 -1 -1}
* keys: {1 4 1 3 4 4 1}
* {1 2 1 3 2 2 1}
* values: {3 9 1 4 2 5 7}
* {"a" "c" "bb" "ee" "z" "x" "d"}
* offset: {-2, 1}
* fill_value: {-1, "42"}
* result (group order may be different):
* keys: {3 1 1 1 4 4 4}
* {3 1 1 1 2 2 2}
* values: {-1 7 -1 -1 5 -1 -1}
* {"42" "42" "a" "bb" "42" "c" "z"}
*
* @endcode
*
* @param values Column to be shifted
* @param offset The off set by which to shift the input
* @param fill_value Fill value for indeterminable outputs
* @param values Table whose columns are to be shifted
* @param offsets The offsets by which to shift the input
* @param fill_values Fill values for indeterminable outputs
* @param mr Device memory resource used to allocate the returned table and columns' device memory
* @return Pair containing the table with each group's key and the column shifted
* @return Pair containing the table of each group's keys and the table of shifted columns
*
* @throws cudf::logic_error if @p fill_value dtype does not match @p input dtype
* @throws cudf::logic_error if @p fill_values[i] dtype does not match @p values[i] dtype for
* `i`th column
*/
std::pair<std::unique_ptr<table>, std::unique_ptr<column>> shift(
column_view const& values,
size_type offset,
scalar const& fill_value,
std::pair<std::unique_ptr<table>, std::unique_ptr<table>> shift(
table_view const& values,
host_span<size_type const> offsets,
std::vector<std::reference_wrapper<const scalar>> const& fill_values,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
38 changes: 26 additions & 12 deletions cpp/src/groupby/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include <rmm/cuda_stream_view.hpp>

#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>

#include <memory>
#include <utility>
Expand Down Expand Up @@ -262,23 +263,36 @@ detail::sort::sort_groupby_helper& groupby::helper()
return *_helper;
};

std::pair<std::unique_ptr<table>, std::unique_ptr<column>> groupby::shift(
column_view const& values,
size_type offset,
scalar const& fill_value,
std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
table_view const& values,
host_span<size_type const> offsets,
std::vector<std::reference_wrapper<const scalar>> const& fill_values,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(values.type() == fill_value.type(),
"values and fill_value should have the same type.");
CUDF_EXPECTS(values.num_columns() == static_cast<size_type>(fill_values.size()),
"Mismatch number of fill_values and columns.");
CUDF_EXPECTS(
std::all_of(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(values.num_columns()),
[&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }),
"values and fill_value should have the same type.");

auto stream = rmm::cuda_stream_default;
auto grouped_values = helper().grouped_values(values, stream);
auto stream = rmm::cuda_stream_default;
std::vector<std::unique_ptr<column>> results;
auto const& group_offsets = helper().group_offsets(stream);
std::transform(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(values.num_columns()),
std::back_inserter(results),
[&](size_type i) {
auto grouped_values = helper().grouped_values(values.column(i), stream);
return cudf::detail::segmented_shift(
grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr);
});

return std::make_pair(
helper().sorted_keys(stream, mr),
std::move(cudf::detail::segmented_shift(
grouped_values->view(), helper().group_offsets(stream), offset, fill_value, stream, mr)));
return std::make_pair(helper().sorted_keys(stream, mr),
std::make_unique<cudf::table>(std::move(results)));
}

} // namespace groupby
Expand Down
Loading

0 comments on commit cd7fe6f

Please sign in to comment.