Merge branch 'branch-22.02' into dictionary-copy-if-else
davidwendt committed Dec 14, 2021
2 parents c32f625 + b3b299a commit 7d89adb
Showing 5 changed files with 224 additions and 107 deletions.
8 changes: 6 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -171,8 +171,12 @@ ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp)
 # ##################################################################################################
 # * groupby benchmark -----------------------------------------------------------------------------
 ConfigureBench(
-  GROUPBY_BENCH groupby/group_sum_benchmark.cu groupby/group_nth_benchmark.cu
-  groupby/group_shift_benchmark.cu groupby/group_struct_benchmark.cu
+  GROUPBY_BENCH
+  groupby/group_sum_benchmark.cu
+  groupby/group_nth_benchmark.cu
+  groupby/group_shift_benchmark.cu
+  groupby/group_struct_benchmark.cu
+  groupby/group_no_requests_benchmark.cu
 )
 
 # ##################################################################################################
115 changes: 115 additions & 0 deletions cpp/benchmarks/groupby/group_no_requests_benchmark.cu
@@ -0,0 +1,115 @@
/*
 * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cudf/copying.hpp>
#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <fixture/benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <memory>
#include <random>

class Groupby : public cudf::benchmark {
};

// TODO: put it in a struct so `uniform` can be remade with different min, max
template <typename T>
T random_int(T min, T max)
{
  static unsigned seed = 13377331;
  static std::mt19937 engine{seed};
  static std::uniform_int_distribution<T> uniform{min, max};

  return uniform(engine);
}

void BM_basic_no_requests(benchmark::State& state)
{
  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

  const cudf::size_type column_size{(cudf::size_type)state.range(0)};

  auto data_it = cudf::detail::make_counting_transform_iterator(
    0, [=](cudf::size_type row) { return random_int(0, 100); });

  wrapper keys(data_it, data_it + column_size);
  wrapper vals(data_it, data_it + column_size);

  std::vector<cudf::groupby::aggregation_request> requests;

  for (auto _ : state) {
    cuda_event_timer timer(state, true);
    cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
    auto result = gb_obj.aggregate(requests);
  }
}

BENCHMARK_DEFINE_F(Groupby, BasicNoRequest)(::benchmark::State& state)
{
  BM_basic_no_requests(state);
}

BENCHMARK_REGISTER_F(Groupby, BasicNoRequest)
  ->UseManualTime()
  ->Unit(benchmark::kMillisecond)
  ->Arg(10000)
  ->Arg(1000000)
  ->Arg(10000000)
  ->Arg(100000000);

void BM_pre_sorted_no_requests(benchmark::State& state)
{
  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

  const cudf::size_type column_size{(cudf::size_type)state.range(0)};

  auto data_it = cudf::detail::make_counting_transform_iterator(
    0, [=](cudf::size_type row) { return random_int(0, 100); });
  auto valid_it = cudf::detail::make_counting_transform_iterator(
    0, [=](cudf::size_type row) { return random_int(0, 100) < 90; });

  wrapper keys(data_it, data_it + column_size);
  wrapper vals(data_it, data_it + column_size, valid_it);

  auto keys_table  = cudf::table_view({keys});
  auto sort_order  = cudf::sorted_order(keys_table);
  auto sorted_keys = cudf::gather(keys_table, *sort_order);
  // No need to sort values using sort_order because they were generated randomly

  std::vector<cudf::groupby::aggregation_request> requests;

  for (auto _ : state) {
    cuda_event_timer timer(state, true);
    cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES);
    auto result = gb_obj.aggregate(requests);
  }
}

BENCHMARK_DEFINE_F(Groupby, PreSortedNoRequests)(::benchmark::State& state)
{
  BM_pre_sorted_no_requests(state);
}

BENCHMARK_REGISTER_F(Groupby, PreSortedNoRequests)
  ->UseManualTime()
  ->Unit(benchmark::kMillisecond)
  ->Arg(1000000)
  ->Arg(10000000)
  ->Arg(100000000);
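
An aside on the TODO above `random_int`: because the distribution is a function-local static, it is constructed once with the first `min`/`max` it sees, and later calls with different bounds silently reuse the original ones. A minimal sketch of the struct-based generator the comment suggests, assuming nothing beyond the standard library (the name `random_int_generator` is illustrative, not part of this commit):

#include <random>

// Illustrative only: the engine and distribution are members, so each
// instance keeps its own bounds instead of freezing the first min/max
// in a function-local static.
template <typename T>
struct random_int_generator {
  std::mt19937 engine{13377331};  // same fixed seed as the benchmark above
  std::uniform_int_distribution<T> uniform;

  random_int_generator(T min, T max) : uniform{min, max} {}
  T operator()() { return uniform(engine); }
};

// Usage sketch: random_int_generator<int> gen{0, 100}; auto v = gen();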
27 changes: 10 additions & 17 deletions cpp/src/groupby/hash/groupby.cu
@@ -636,23 +636,16 @@ std::unique_ptr<table> groupby(table_view const& keys,
  */
 bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests)
 {
-  auto const all_hash_aggregations =
-    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-      return cudf::has_atomic_support(r.values.type()) and
-             std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
-               return is_hash_aggregation(a->kind);
-             });
-    });
-
-  // Currently, structs are not supported in any of hash-based aggregations.
-  // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
-  // TODO: Support structs in hash-based aggregations.
-  auto const has_struct =
-    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-      return r.values.type().id() == type_id::STRUCT;
-    });
-
-  return all_hash_aggregations && !has_struct;
+  return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
+    // Currently, structs are not supported in any of hash-based aggregations.
+    // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
+    // TODO: Support structs in hash-based aggregations.
+    return not(r.values.type().id() == type_id::STRUCT) and
+           cudf::has_atomic_support(r.values.type()) and
+           std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
+             return is_hash_aggregation(a->kind);
+           });
+  });
 }
 
 // Hash-based groupby
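
To make the consolidation easier to follow: the removed code answered "are all aggregations hash-supported?" and "are all requests struct-typed?" in two separate passes and then combined the answers, while the new code folds everything into one per-request predicate. A self-contained sketch of that predicate's shape, using illustrative stand-in types rather than libcudf's real ones:

#include <algorithm>
#include <vector>

// Stand-ins for the libcudf types; illustrative only.
struct request {
  bool is_struct;               // r.values.type().id() == type_id::STRUCT
  bool has_atomic_support;      // cudf::has_atomic_support(r.values.type())
  std::vector<bool> hash_aggs;  // is_hash_aggregation(a->kind), per aggregation
};

// One pass: every request must be non-struct, atomically updatable, and
// contain only hash-compatible aggregations.
bool can_use_hash(std::vector<request> const& requests)
{
  return std::all_of(requests.begin(), requests.end(), [](request const& r) {
    return not r.is_struct and r.has_atomic_support and
           std::all_of(r.hash_aggs.begin(), r.hash_aggs.end(),
                       [](bool is_hash) { return is_hash; });
  });
}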
144 changes: 76 additions & 68 deletions python/cudf/cudf/core/series.py
@@ -3312,51 +3312,55 @@ def _format_percentile_names(percentiles):
             return ["{0}%".format(int(x * 100)) for x in percentiles]
 
         def _format_stats_values(stats_data):
-            return list(map(lambda x: round(x, 6), stats_data))
+            return map(lambda x: round(x, 6), stats_data)
 
         def _describe_numeric(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-            data = (
-                [self.count(), self.mean(), self.std(), self.min()]
-                + self.quantile(percentiles).to_numpy(na_value=np.nan).tolist()
-                + [self.max()]
-            )
-            data = _format_stats_values(data)
+            data = {
+                "count": self.count(),
+                "mean": self.mean(),
+                "std": self.std(),
+                "min": self.min(),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": self.max(),
+            }
 
             return Series(
-                data=data, index=index, nan_as_null=False, name=self.name,
+                data=_format_stats_values(data.values()),
+                index=data.keys(),
+                nan_as_null=False,
+                name=self.name,
             )
 
         def _describe_timedelta(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean()),
-                    str(self.std()),
-                    str(pd.Timedelta(self.min())),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timedelta(self.max()))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(self.mean()),
+                "std": str(self.std()),
+                "min": str(pd.Timedelta(self.min())),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": str(pd.Timedelta(self.max())),
+            }
 
             return Series(
-                data=data,
-                index=index,
+                data=data.values(),
+                index=data.keys(),
                 dtype="str",
                 nan_as_null=False,
                 name=self.name,
@@ -3365,51 +3369,55 @@ def _describe_timedelta(self):
         def _describe_categorical(self):
             # blocked by StringColumn/DatetimeColumn support for
             # value_counts/unique
-            index = ["count", "unique", "top", "freq"]
-            val_counts = self.value_counts(ascending=False)
-            data = [self.count(), self.unique().size]
-
-            if data[1] > 0:
-                top, freq = val_counts.index[0], val_counts.iloc[0]
-                data += [str(top), freq]
-            # If the DataFrame is empty, set 'top' and 'freq' to None
-            # to maintain output shape consistency
-            else:
-                data += [None, None]
+            data = {
+                "count": self.count(),
+                "unique": len(self.unique()),
+                "top": None,
+                "freq": None,
+            }
+            if data["count"] > 0:
+                # In case there's a tie, break the tie by sorting the index
+                # and take the top.
+                val_counts = self.value_counts(ascending=False)
+                tied_val_counts = val_counts[
+                    val_counts == val_counts.iloc[0]
+                ].sort_index()
+                data.update(
+                    {
+                        "top": tied_val_counts.index[0],
+                        "freq": tied_val_counts.iloc[0],
+                    }
+                )
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
 
         def _describe_timestamp(self):
-
-            index = (
-                ["count", "mean", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean().to_numpy().astype("datetime64[ns]")),
-                    str(pd.Timestamp(self.min().astype("datetime64[ns]"))),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(pd.Timestamp(self.mean())),
+                "min": str(pd.Timestamp(self.min())),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype(self.dtype)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan),
+                    )
+                ),
+                "max": str(pd.Timestamp((self.max()))),
+            }
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
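
One detail of the `_describe_categorical` change worth spelling out is the tie-break: among all values sharing the highest count, the reported "top" is the one whose index sorts first. A language-neutral sketch of the same rule (plain C++ over an ordered map, not cudf; assumes a non-empty input, mirroring the `data["count"] > 0` guard above):

#include <map>
#include <string>
#include <utility>

// value -> count. std::map iterates keys in sorted order, so keeping the
// first strictly-greater count means ties resolve to the smallest key --
// the same "sort the index and take the top" rule as in the diff above.
std::pair<std::string, long> top_with_tiebreak(std::map<std::string, long> const& counts)
{
  auto best = counts.begin();  // precondition: counts is non-empty
  for (auto it = counts.begin(); it != counts.end(); ++it) {
    if (it->second > best->second) { best = it; }
  }
  return *best;
}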