Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add groupby product support #7763

Merged
merged 21 commits into from
Apr 21, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2c6451e
add atomicMul
karthikeyann Mar 30, 2021
35e805e
add PRODUCT to update_target_element
karthikeyann Mar 30, 2021
04b094c
add PRODUCT to hash groupby
karthikeyann Mar 30, 2021
7492f25
add PRODUCT to sort groupby
karthikeyann Mar 30, 2021
9fe73c5
add groupby product unit tests
karthikeyann Mar 31, 2021
d071b70
copyright year update
karthikeyann Mar 31, 2021
63b5ed7
add python units tests of groupby prod
karthikeyann Mar 31, 2021
7c27145
Merge branch 'branch-0.20' of github.com:rapidsai/cudf into fea-group…
karthikeyann Apr 5, 2021
692f646
remove cuda 10.2 limitations
karthikeyann Apr 6, 2021
f5237d2
address review comments
karthikeyann Apr 6, 2021
254539f
documentation examples
karthikeyann Apr 12, 2021
552a0d0
Merge branch 'branch-0.20' of github.com:rapidsai/cudf into fea-group…
karthikeyann Apr 12, 2021
449f207
stream arg fix
karthikeyann Apr 12, 2021
d0e92e4
Apply suggestions from code review
karthikeyann Apr 13, 2021
1608549
Update python/cudf/cudf/_lib/groupby.pyx
karthikeyann Apr 13, 2021
034bb24
style fix
karthikeyann Apr 13, 2021
abbe816
Merge branch 'branch-0.20' of github.com:rapidsai/cudf into fea-group…
karthikeyann Apr 16, 2021
bc1ad9c
address review comments
karthikeyann Apr 19, 2021
74b7e0a
Merge branch 'branch-0.20' of github.com:rapidsai/cudf into fea-group…
karthikeyann Apr 19, 2021
2ed1a9b
review comments updates
karthikeyann Apr 20, 2021
98b2218
Merge branch 'branch-0.20' of github.com:rapidsai/cudf into fea-group…
karthikeyann Apr 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ add_library(cudf
src/groupby/sort/group_min.cu
src/groupby/sort/group_nth_element.cu
src/groupby/sort/group_nunique.cu
src/groupby/sort/group_product.cu
src/groupby/sort/group_quantiles.cu
src/groupby/sort/group_std.cu
src/groupby/sort/group_sum.cu
Expand Down
38 changes: 24 additions & 14 deletions cpp/include/cudf/detail/aggregation/aggregation.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -314,29 +314,18 @@ struct update_target_element<dictionary32, aggregation::SUM, target_has_nulls, s
}
};

// This code will segfault in nvcc/ptxas 10.2 only
// https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=3186317
// Enabling only for 2 types does not segfault. Using for unit tests.
#if (__CUDACC_VER_MAJOR__ == 10) and (__CUDACC_VER_MINOR__ == 2)
template <typename T>
constexpr bool is_SOS_supported()
{
return std::is_floating_point<T>::value;
}
#else
template <typename T>
constexpr bool is_SOS_supported()
constexpr bool is_product_supported()
{
return is_numeric<T>();
}
#endif

template <typename Source, bool target_has_nulls, bool source_has_nulls>
struct update_target_element<Source,
aggregation::SUM_OF_SQUARES,
target_has_nulls,
source_has_nulls,
std::enable_if_t<is_SOS_supported<Source>()>> {
std::enable_if_t<is_product_supported<Source>()>> {
__device__ void operator()(mutable_column_device_view target,
size_type target_index,
column_device_view source,
Expand All @@ -351,6 +340,26 @@ struct update_target_element<Source,
}
};

/**
 * @brief Specialization of `update_target_element` for the PRODUCT
 * aggregation: atomically multiplies the target element by the source element.
 *
 * Enabled only for types where `is_product_supported<Source>()` holds.
 */
template <typename Source, bool target_has_nulls, bool source_has_nulls>
struct update_target_element<Source,
                             aggregation::PRODUCT,
                             target_has_nulls,
                             source_has_nulls,
                             std::enable_if_t<is_product_supported<Source>()>> {
  __device__ void operator()(mutable_column_device_view target,
                             size_type target_index,
                             column_device_view source,
                             size_type source_index) const noexcept
  {
    // Null source elements contribute nothing to the product.
    if (source_has_nulls and source.is_null(source_index)) { return; }

    using Target = target_type_t<Source, aggregation::PRODUCT>;
    atomicMul(&target.element<Target>(target_index),
              static_cast<Target>(source.element<Source>(source_index)));
    // Once any non-null source value has been folded in, the target is valid.
    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
  }
};

template <typename Source, bool target_has_nulls, bool source_has_nulls>
struct update_target_element<
Source,
Expand Down Expand Up @@ -559,7 +568,8 @@ struct identity_initializer {
k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or
k == aggregation::ARGMAX or k == aggregation::ARGMIN or
k == aggregation::SUM_OF_SQUARES or k == aggregation::STD or
k == aggregation::VARIANCE);
k == aggregation::VARIANCE or
(k == aggregation::PRODUCT and is_product_supported<T>()));
}

template <typename T, aggregation::Kind k>
Expand Down
22 changes: 22 additions & 0 deletions cpp/include/cudf/detail/utilities/device_atomics.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,28 @@ __forceinline__ __device__ T atomicAdd(T* address, T val)
return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{});
}

/**
 * @brief Overloads for `atomicMul`
 * reads the `old` located at the `address` in global or shared memory,
 * computes (old * val), and stores the result back to memory at the same
 * address. These three operations are performed in one atomic transaction.
 *
 * The supported cudf types for `atomicMul` are:
 * int8_t, int16_t, int32_t, int64_t, float, double, and bool
 *
 * All types are implemented by `atomicCAS`.
 *
 * @param[in] address The address of old value in global or shared memory
 * @param[in] val The value to be multiplied
 *
 * @returns The old value at `address`
 */
template <typename T>
__forceinline__ __device__ T atomicMul(T* address, T val)
{
  // Delegate to the generic atomic read-modify-write helper, parameterized
  // with the product binary operator.
  return cudf::genericAtomicOperation(address, val, cudf::DeviceProduct{});
}

/**
* @brief Overloads for `atomicMin`
* reads the `old` located at the `address` in global or shared memory,
Expand Down
39 changes: 18 additions & 21 deletions cpp/src/groupby/hash/groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,33 +55,37 @@ namespace groupby {
namespace detail {
namespace hash {
namespace {
// This is a temporary fix due to compiler bug and we can resort back to
// constexpr once cuda 10.2 becomes RAPIDS's minimum compiler version
#if 0

/**
* @brief List of aggregation operations that can be computed with a hash-based
* implementation.
*/
constexpr std::array<aggregation::Kind, 10> hash_aggregations{
aggregation::SUM, aggregation::MIN, aggregation::MAX,
aggregation::COUNT_VALID, aggregation::COUNT_ALL,
aggregation::ARGMIN, aggregation::ARGMAX,
aggregation::SUM_OF_SQUARES,
aggregation::MEAN, aggregation::STD, aggregation::VARIANCE};

//Could be hash: SUM, PRODUCT, MIN, MAX, COUNT_VALID, COUNT_ALL, ANY, ALL,
constexpr std::array<aggregation::Kind, 12> hash_aggregations{aggregation::SUM,
aggregation::PRODUCT,
aggregation::MIN,
aggregation::MAX,
aggregation::COUNT_VALID,
aggregation::COUNT_ALL,
aggregation::ARGMIN,
aggregation::ARGMAX,
aggregation::SUM_OF_SQUARES,
aggregation::MEAN,
aggregation::STD,
aggregation::VARIANCE};

// Could be hash: SUM, PRODUCT, MIN, MAX, COUNT_VALID, COUNT_ALL, ANY, ALL,
// Compound: MEAN(SUM, COUNT_VALID), VARIANCE, STD(MEAN (SUM, COUNT_VALID), COUNT_VALID),
// ARGMAX, ARGMIN

/**
 * @brief Linear membership test over a `std::array`, usable at compile time.
 *
 * @param haystack The array to search
 * @param needle The value to look for
 * @return true if `needle` compares equal to any element of `haystack`
 */
template <class T, std::size_t N>
constexpr bool array_contains(std::array<T, N> const& haystack, T needle)
{
  for (auto const& element : haystack) {
    if (element == needle) { return true; }
  }
  return false;
}

/**
 * @brief Indicates whether the specified aggregation operation can be computed
 * with a hash-based implementation.
 *
 * @param t The aggregation kind to check
 * @return true if `t` is supported by the hash-based groupby implementation
 */
bool constexpr is_hash_aggregation(aggregation::Kind t)
{
  // Membership is checked against the constexpr `hash_aggregations` table;
  // the hard-coded kind list (a CUDA 10.2 compiler-bug workaround) is gone.
  return array_contains(hash_aggregations, t);
}

template <typename Map>
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/groupby/sort/aggregate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,18 @@ void aggregrate_result_functor::operator()<aggregation::SUM>(aggregation const&
get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
};

template <>
void aggregrate_result_functor::operator()<aggregation::PRODUCT>(aggregation const& agg)
{
  // Skip the computation when a previous request already produced and cached
  // the PRODUCT result for this column.
  if (cache.has_result(col_idx, agg)) return;

  cache.add_result(
    col_idx,
    agg,
    detail::group_product(
      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
};

template <>
void aggregrate_result_functor::operator()<aggregation::ARGMAX>(aggregation const& agg)
{
Expand Down
46 changes: 46 additions & 0 deletions cpp/src/groupby/sort/group_product.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/utilities/span.hpp>
#include <groupby/sort/group_single_pass_reduction_util.cuh>
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace groupby {
namespace detail {
/**
 * @brief Computes the per-group product of `values` for the sort-based groupby.
 *
 * @param values Grouped values to reduce
 * @param num_groups Number of groups
 * @param group_labels Group label for each element of `values`
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column
 * @return Column of per-group products
 */
std::unique_ptr<column> group_product(column_view const& values,
                                      size_type num_groups,
                                      cudf::device_span<size_type const> group_labels,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
{
  // Dictionary columns dispatch on their keys' type; all others on their own type.
  auto const dispatch_type = cudf::is_dictionary(values.type())
                               ? dictionary_column_view(values).keys().type()
                               : values.type();
  return type_dispatcher(dispatch_type,
                         reduce_functor<aggregation::PRODUCT>{},
                         values,
                         num_groups,
                         group_labels,
                         stream,
                         mr);
}

} // namespace detail
} // namespace groupby
} // namespace cudf
Loading