Merge branch 'branch-22.02' into dictionary-copy-if-else
davidwendt committed Dec 14, 2021
2 parents c32f625 + b3b299a commit 7d89adb
Showing 5 changed files with 224 additions and 107 deletions.
8 changes: 6 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -171,8 +171,12 @@ ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp)
 # ##################################################################################################
 # * groupby benchmark -----------------------------------------------------------------------------
 ConfigureBench(
-  GROUPBY_BENCH groupby/group_sum_benchmark.cu groupby/group_nth_benchmark.cu
-  groupby/group_shift_benchmark.cu groupby/group_struct_benchmark.cu
+  GROUPBY_BENCH
+  groupby/group_sum_benchmark.cu
+  groupby/group_nth_benchmark.cu
+  groupby/group_shift_benchmark.cu
+  groupby/group_struct_benchmark.cu
+  groupby/group_no_requests_benchmark.cu
 )
 
 # ##################################################################################################
115 changes: 115 additions & 0 deletions cpp/benchmarks/groupby/group_no_requests_benchmark.cu
@@ -0,0 +1,115 @@
/*
 * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cudf/copying.hpp>
#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <fixture/benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <memory>
#include <random>

class Groupby : public cudf::benchmark {
};

// TODO: put it in a struct so `uniform` can be remade with different min, max
template <typename T>
T random_int(T min, T max)
{
  static unsigned seed = 13377331;
  static std::mt19937 engine{seed};
  static std::uniform_int_distribution<T> uniform{min, max};

  return uniform(engine);
}

void BM_basic_no_requests(benchmark::State& state)
{
  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

  const cudf::size_type column_size{(cudf::size_type)state.range(0)};

  auto data_it = cudf::detail::make_counting_transform_iterator(
    0, [=](cudf::size_type row) { return random_int(0, 100); });

  wrapper keys(data_it, data_it + column_size);
  wrapper vals(data_it, data_it + column_size);

  std::vector<cudf::groupby::aggregation_request> requests;

  for (auto _ : state) {
    cuda_event_timer timer(state, true);
    cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
    auto result = gb_obj.aggregate(requests);
  }
}

BENCHMARK_DEFINE_F(Groupby, BasicNoRequest)(::benchmark::State& state)
{
  BM_basic_no_requests(state);
}

BENCHMARK_REGISTER_F(Groupby, BasicNoRequest)
  ->UseManualTime()
  ->Unit(benchmark::kMillisecond)
  ->Arg(10000)
  ->Arg(1000000)
  ->Arg(10000000)
  ->Arg(100000000);

void BM_pre_sorted_no_requests(benchmark::State& state)
{
  using wrapper = cudf::test::fixed_width_column_wrapper<int64_t>;

  const cudf::size_type column_size{(cudf::size_type)state.range(0)};

  auto data_it = cudf::detail::make_counting_transform_iterator(
    0, [=](cudf::size_type row) { return random_int(0, 100); });
  auto valid_it = cudf::detail::make_counting_transform_iterator(
    0, [=](cudf::size_type row) { return random_int(0, 100) < 90; });

  wrapper keys(data_it, data_it + column_size);
  wrapper vals(data_it, data_it + column_size, valid_it);

  auto keys_table  = cudf::table_view({keys});
  auto sort_order  = cudf::sorted_order(keys_table);
  auto sorted_keys = cudf::gather(keys_table, *sort_order);
  // No need to sort values using sort_order because they were generated randomly

  std::vector<cudf::groupby::aggregation_request> requests;

  for (auto _ : state) {
    cuda_event_timer timer(state, true);
    cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES);
    auto result = gb_obj.aggregate(requests);
  }
}

BENCHMARK_DEFINE_F(Groupby, PreSortedNoRequests)(::benchmark::State& state)
{
  BM_pre_sorted_no_requests(state);
}

BENCHMARK_REGISTER_F(Groupby, PreSortedNoRequests)
  ->UseManualTime()
  ->Unit(benchmark::kMillisecond)
  ->Arg(1000000)
  ->Arg(10000000)
  ->Arg(100000000);
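
An aside on the TODO above `random_int`: because the distribution is a function-local static, it is constructed once with the first `min`/`max` it sees, and later calls with different bounds silently reuse the original ones. A minimal sketch of the struct-based generator the comment suggests, assuming nothing beyond the standard library (the name `random_int_generator` is illustrative, not part of this commit):

#include <random>

// Illustrative only: the engine and distribution are members, so each
// instance keeps its own bounds instead of freezing the first min/max
// in a function-local static.
template <typename T>
struct random_int_generator {
  std::mt19937 engine{13377331};  // same fixed seed as the benchmark above
  std::uniform_int_distribution<T> uniform;

  random_int_generator(T min, T max) : uniform{min, max} {}
  T operator()() { return uniform(engine); }
};

// Usage sketch: random_int_generator<int> gen{0, 100}; auto v = gen();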
27 changes: 10 additions & 17 deletions cpp/src/groupby/hash/groupby.cu
@@ -636,23 +636,16 @@ std::unique_ptr<table> groupby(table_view const& keys,
  */
 bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request const> requests)
 {
-  auto const all_hash_aggregations =
-    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-      return cudf::has_atomic_support(r.values.type()) and
-             std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
-               return is_hash_aggregation(a->kind);
-             });
-    });
-
-  // Currently, structs are not supported in any of hash-based aggregations.
-  // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
-  // TODO: Support structs in hash-based aggregations.
-  auto const has_struct =
-    std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
-      return r.values.type().id() == type_id::STRUCT;
-    });
-
-  return all_hash_aggregations && !has_struct;
+  return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) {
+    // Currently, structs are not supported in any of hash-based aggregations.
+    // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
+    // TODO: Support structs in hash-based aggregations.
+    return not(r.values.type().id() == type_id::STRUCT) and
+           cudf::has_atomic_support(r.values.type()) and
+           std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
+             return is_hash_aggregation(a->kind);
+           });
+  });
 }
 
 // Hash-based groupby
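
To make the consolidation easier to follow: the removed code answered "are all aggregations hash-supported?" and "are all requests struct-typed?" in two separate passes and then combined the answers, while the new code folds everything into one per-request predicate. A self-contained sketch of that predicate's shape, using illustrative stand-in types rather than libcudf's real ones:

#include <algorithm>
#include <vector>

// Stand-ins for the libcudf types; illustrative only.
struct request {
  bool is_struct;               // r.values.type().id() == type_id::STRUCT
  bool has_atomic_support;      // cudf::has_atomic_support(r.values.type())
  std::vector<bool> hash_aggs;  // is_hash_aggregation(a->kind), per aggregation
};

// One pass: every request must be non-struct, atomically updatable, and
// contain only hash-compatible aggregations.
bool can_use_hash(std::vector<request> const& requests)
{
  return std::all_of(requests.begin(), requests.end(), [](request const& r) {
    return not r.is_struct and r.has_atomic_support and
           std::all_of(r.hash_aggs.begin(), r.hash_aggs.end(),
                       [](bool is_hash) { return is_hash; });
  });
}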
144 changes: 76 additions & 68 deletions python/cudf/cudf/core/series.py
@@ -3312,51 +3312,55 @@ def _format_percentile_names(percentiles):
             return ["{0}%".format(int(x * 100)) for x in percentiles]
 
         def _format_stats_values(stats_data):
-            return list(map(lambda x: round(x, 6), stats_data))
+            return map(lambda x: round(x, 6), stats_data)
 
         def _describe_numeric(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-            data = (
-                [self.count(), self.mean(), self.std(), self.min()]
-                + self.quantile(percentiles).to_numpy(na_value=np.nan).tolist()
-                + [self.max()]
-            )
-            data = _format_stats_values(data)
+            data = {
+                "count": self.count(),
+                "mean": self.mean(),
+                "std": self.std(),
+                "min": self.min(),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": self.max(),
+            }
 
             return Series(
-                data=data, index=index, nan_as_null=False, name=self.name,
+                data=_format_stats_values(data.values()),
+                index=data.keys(),
+                nan_as_null=False,
+                name=self.name,
             )
 
         def _describe_timedelta(self):
             # mimicking pandas
-            index = (
-                ["count", "mean", "std", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean()),
-                    str(self.std()),
-                    str(pd.Timedelta(self.min())),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timedelta(self.max()))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(self.mean()),
+                "std": str(self.std()),
+                "min": str(pd.Timedelta(self.min())),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan)
+                        .tolist(),
+                    )
+                ),
+                "max": str(pd.Timedelta(self.max())),
+            }
 
             return Series(
-                data=data,
-                index=index,
+                data=data.values(),
+                index=data.keys(),
                 dtype="str",
                 nan_as_null=False,
                 name=self.name,
@@ -3365,51 +3369,55 @@ def _describe_timedelta(self):
         def _describe_categorical(self):
             # blocked by StringColumn/DatetimeColumn support for
             # value_counts/unique
-            index = ["count", "unique", "top", "freq"]
-            val_counts = self.value_counts(ascending=False)
-            data = [self.count(), self.unique().size]
-
-            if data[1] > 0:
-                top, freq = val_counts.index[0], val_counts.iloc[0]
-                data += [str(top), freq]
-            # If the DataFrame is empty, set 'top' and 'freq' to None
-            # to maintain output shape consistency
-            else:
-                data += [None, None]
+            data = {
+                "count": self.count(),
+                "unique": len(self.unique()),
+                "top": None,
+                "freq": None,
+            }
+            if data["count"] > 0:
+                # In case there's a tie, break the tie by sorting the index
+                # and take the top.
+                val_counts = self.value_counts(ascending=False)
+                tied_val_counts = val_counts[
+                    val_counts == val_counts.iloc[0]
+                ].sort_index()
+                data.update(
+                    {
+                        "top": tied_val_counts.index[0],
+                        "freq": tied_val_counts.iloc[0],
+                    }
+                )
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
 
         def _describe_timestamp(self):
-
-            index = (
-                ["count", "mean", "min"]
-                + _format_percentile_names(percentiles)
-                + ["max"]
-            )
-
-            data = (
-                [
-                    str(self.count()),
-                    str(self.mean().to_numpy().astype("datetime64[ns]")),
-                    str(pd.Timestamp(self.min().astype("datetime64[ns]"))),
-                ]
-                + self.quantile(percentiles)
-                .astype("str")
-                .to_numpy(na_value=None)
-                .tolist()
-                + [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))]
-            )
+            data = {
+                "count": str(self.count()),
+                "mean": str(pd.Timestamp(self.mean())),
+                "min": str(pd.Timestamp(self.min())),
+                **dict(
+                    zip(
+                        _format_percentile_names(percentiles),
+                        self.quantile(percentiles)
+                        .astype(self.dtype)
+                        .astype("str")
+                        .to_numpy(na_value=np.nan),
+                    )
+                ),
+                "max": str(pd.Timestamp((self.max()))),
+            }
 
             return Series(
-                data=data,
+                data=data.values(),
                 dtype="str",
-                index=index,
+                index=data.keys(),
                 nan_as_null=False,
                 name=self.name,
             )
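
One detail of the `_describe_categorical` change worth spelling out is the tie-break: among all values sharing the highest count, the reported "top" is the one whose index sorts first. A language-neutral sketch of the same rule (plain C++ over an ordered map, not cudf; assumes a non-empty input, mirroring the `data["count"] > 0` guard above):

#include <map>
#include <string>
#include <utility>

// value -> count. std::map iterates keys in sorted order, so keeping the
// first strictly-greater count means ties resolve to the smallest key --
// the same "sort the index and take the top" rule as in the diff above.
std::pair<std::string, long> top_with_tiebreak(std::map<std::string, long> const& counts)
{
  auto best = counts.begin();  // precondition: counts is non-empty
  for (auto it = counts.begin(); it != counts.end(); ++it) {
    if (it->second > best->second) { best = it; }
  }
  return *best;
}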