[SYCL] Support lambda functions passed to reduction #2190

Merged: 5 commits, Jul 29, 2020
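In short, this change lets the combiner of a reduction be a lambda instead of only a default-constructible functor such as intel::plus<T>. A minimal usage sketch, modeled on the reduction_nd_lambda.cpp test added in this PR (the kernel name SumKernel, sizes, and input values are illustrative, not part of the change):

#include <CL/sycl.hpp>
#include <cassert>
using namespace cl::sycl;

int main() {
  constexpr size_t N = 32, WG = 8;
  buffer<int, 1> In(N);
  buffer<int, 1> Out(1);
  { // Fill the input on the host.
    auto I = In.get_access<access::mode::write>();
    for (size_t i = 0; i < N; ++i)
      I[i] = 1;
  }

  queue Q;
  Q.submit([&](handler &CGH) {
    auto InAcc = In.get_access<access::mode::read>(CGH);
    auto OutAcc = Out.get_access<access::mode::discard_write>(CGH);

    // The binary operation is a lambda; before this change it had to be a
    // default-constructible functor such as intel::plus<int>.
    auto Redu = intel::reduction(OutAcc, 0, [](auto X, auto Y) { return X + Y; });

    CGH.parallel_for<class SumKernel>(
        nd_range<1>{range<1>{N}, range<1>{WG}}, Redu,
        [=](nd_item<1> It, auto &Sum) {
          Sum.combine(InAcc[It.get_global_linear_id()]);
        });
  });

  // Read back and check the result on the host.
  auto Result = Out.get_access<access::mode::read>();
  assert(Result[0] == static_cast<int>(N) && "Unexpected reduction result.");
  return 0;
}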
43 changes: 24 additions & 19 deletions sycl/include/CL/sycl/intel/reduction.hpp
@@ -147,18 +147,17 @@ using IsKnownIdentityOp =
template <typename T, class BinaryOperation, typename Subst = void>
class reducer {
public:
reducer(const T &Identity) : MValue(Identity), MIdentity(Identity) {}
void combine(const T &Partial) {
BinaryOperation BOp;
MValue = BOp(MValue, Partial);
}
reducer(const T &Identity, BinaryOperation BOp)
: MValue(Identity), MIdentity(Identity), MBinaryOp(BOp) {}
void combine(const T &Partial) { MValue = MBinaryOp(MValue, Partial); }

T getIdentity() const { return MIdentity; }

T MValue;

private:
const T MIdentity;
BinaryOperation MBinaryOp;
};

/// Specialization of the generic class 'reducer'. It is used for reductions
@@ -183,7 +182,7 @@ class reducer<T, BinaryOperation,
enable_if_t<IsKnownIdentityOp<T, BinaryOperation>::value>> {
public:
reducer() : MValue(getIdentity()) {}
reducer(const T &) : MValue(getIdentity()) {}
reducer(const T &, BinaryOperation) : MValue(getIdentity()) {}

void combine(const T &Partial) {
BinaryOperation BOp;
@@ -405,7 +404,7 @@ class reduction_impl {
template <
typename _T = T, class _BinaryOperation = BinaryOperation,
enable_if_t<IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
reduction_impl(accessor_type &Acc, const T &Identity)
reduction_impl(accessor_type &Acc, const T &Identity, BinaryOperation)
: MAcc(shared_ptr_class<accessor_type>(shared_ptr_class<accessor_type>{},
&Acc)),
MIdentity(getIdentity()) {
@@ -431,10 +430,10 @@ class reduction_impl {
template <
typename _T = T, class _BinaryOperation = BinaryOperation,
enable_if_t<!IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
reduction_impl(accessor_type &Acc, const T &Identity)
reduction_impl(accessor_type &Acc, const T &Identity, BinaryOperation BOp)
: MAcc(shared_ptr_class<accessor_type>(shared_ptr_class<accessor_type>{},
&Acc)),
MIdentity(Identity) {
MIdentity(Identity), MBinaryOp(BOp) {
assert(Acc.get_count() == 1 &&
"Only scalar/1-element reductions are supported now.");
}
@@ -456,7 +455,7 @@ class reduction_impl {
template <
typename _T = T, class _BinaryOperation = BinaryOperation,
enable_if_t<IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
reduction_impl(T *VarPtr, const T &Identity)
reduction_impl(T *VarPtr, const T &Identity, BinaryOperation)
: MIdentity(Identity), MUSMPointer(VarPtr) {
// For now the implementation ignores the identity value given by user
// when the implementation knows the identity.
@@ -478,8 +477,8 @@ class reduction_impl {
template <
typename _T = T, class _BinaryOperation = BinaryOperation,
enable_if_t<!IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
reduction_impl(T *VarPtr, const T &Identity)
: MIdentity(Identity), MUSMPointer(VarPtr) {}
reduction_impl(T *VarPtr, const T &Identity, BinaryOperation BOp)
: MIdentity(Identity), MUSMPointer(VarPtr), MBinaryOp(BOp) {}

/// Associates reduction accessor with the given handler and saves reduction
/// buffer so that it is alive until the command group finishes the work.
@@ -563,6 +562,9 @@ class reduction_impl {
return OutPtr;
}

/// Returns the binary operation associated with the reduction.
BinaryOperation getBinaryOperation() const { return MBinaryOp; }

private:
/// Identity of the BinaryOperation.
/// The result of BinaryOperation(X, MIdentity) is equal to X for any X.
@@ -576,6 +578,8 @@ class reduction_impl {
/// USM pointer referencing the memory to where the result of the reduction
/// must be written. Applicable/used only for USM reductions.
T *MUSMPointer = nullptr;

BinaryOperation MBinaryOp;
};

/// These are the forward declaration for the classes that help to create
@@ -794,9 +798,10 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
typename Reduction::result_type ReduIdentity = Redu.getIdentity();
using Name = typename get_reduction_main_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
auto BOp = Redu.getBinaryOperation();
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
// Call user's functions. Reducer.MValue gets initialized there.
typename Reduction::reducer_type Reducer(ReduIdentity);
typename Reduction::reducer_type Reducer(ReduIdentity, BOp);
KernelFunc(NDIt, Reducer);

size_t WGSize = NDIt.get_local_range().size();
@@ -811,7 +816,6 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
// LocalReds[WGSize] accumulates last/odd elements when the step
// of tree-reduction loop is not even.
typename Reduction::binary_operation BOp;
size_t PrevStep = WGSize;
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
if (LID < CurStep)
@@ -925,6 +929,7 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);

auto ReduIdentity = Redu.getIdentity();
auto BOp = Redu.getBinaryOperation();
using Name = typename get_reduction_aux_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
@@ -943,7 +948,6 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
// LocalReds[WGSize] accumulates last/odd elements when the step
// of tree-reduction loop is not even.
typename Reduction::binary_operation BOp;
size_t PrevStep = WGSize;
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
if (LID < CurStep)
@@ -1022,10 +1026,10 @@ template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
access::placeholder IsPH>
detail::reduction_impl<T, BinaryOperation, Dims, false, AccMode, IsPH>
reduction(accessor<T, Dims, AccMode, access::target::global_buffer, IsPH> &Acc,
const T &Identity, BinaryOperation) {
const T &Identity, BinaryOperation BOp) {
// The Combiner argument was needed only to define the BinaryOperation param.
return detail::reduction_impl<T, BinaryOperation, Dims, false, AccMode, IsPH>(
Acc, Identity);
Acc, Identity, BOp);
}

/// Creates and returns an object implementing the reduction functionality.
@@ -1050,9 +1054,10 @@ reduction(accessor<T, Dims, AccMode, access::target::global_buffer, IsPH> &Acc,
/// \param Identity, and the binary operation used in the reduction.
template <typename T, class BinaryOperation>
detail::reduction_impl<T, BinaryOperation, 0, true, access::mode::read_write>
reduction(T *VarPtr, const T &Identity, BinaryOperation) {
reduction(T *VarPtr, const T &Identity, BinaryOperation BOp) {
return detail::reduction_impl<T, BinaryOperation, 0, true,
access::mode::read_write>(VarPtr, Identity);
access::mode::read_write>(VarPtr, Identity,
BOp);
}

/// Creates and returns an object implementing the reduction functionality.
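The thread running through the header changes above: a lambda's closure type has no default constructor, so the previous pattern of declaring a local "BinaryOperation BOp;" inside combine() and the tree-reduction loops cannot work once lambdas are accepted. Instead, the operation instance passed to reduction() is stored in reduction_impl as MBinaryOp, exposed via getBinaryOperation(), and forwarded into each reducer. A stripped-down sketch of that pattern, simplified for illustration and not the actual header:

#include <cstddef>

// Simplified illustration: keep an instance of the user-provided operation
// instead of default-constructing its type.
template <typename T, class BinaryOperation>
class reducer_sketch {
public:
  reducer_sketch(const T &Identity, BinaryOperation BOp)
      : MValue(Identity), MBinaryOp(BOp) {}

  // Works even when BinaryOperation is a lambda closure type, which cannot be
  // created with a declaration like "BinaryOperation BOp;".
  void combine(const T &Partial) { MValue = MBinaryOp(MValue, Partial); }

  T MValue;

private:
  BinaryOperation MBinaryOp;
};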
98 changes: 38 additions & 60 deletions sycl/test/reduction/reduction_ctor.cpp
@@ -23,13 +23,12 @@ void test_reducer(Reduction &Redu, T A, T B) {
"Wrong result of binary operation.");
}

template <typename T, typename Reduction>
void test_reducer(Reduction &Redu, T Identity, T A, T B) {
typename Reduction::reducer_type Reducer(Identity);
template <typename T, typename Reduction, typename BinaryOperation>
void test_reducer(Reduction &Redu, T Identity, BinaryOperation BOp, T A, T B) {
typename Reduction::reducer_type Reducer(Identity, BOp);
Reducer.combine(A);
Reducer.combine(B);

typename Reduction::binary_operation BOp;
T ExpectedValue = BOp(A, B);
assert(ExpectedValue == Reducer.MValue &&
"Wrong result of binary operation.");
@@ -40,35 +39,8 @@ class Known;
template <typename T, int Dim, class BinaryOperation>
class Unknown;

template <typename T>
struct Point {
Point() : X(0), Y(0) {}
Point(T X, T Y) : X(X), Y(Y) {}
Point(T V) : X(V), Y(V) {}
bool operator==(const Point &P) const {
return P.X == X && P.Y == Y;
}
T X;
T Y;
};

template <typename T>
bool operator==(const Point<T> &A, const Point<T> &B) {
return A.X == B.X && A.Y == B.Y;
}

template <class T>
struct PointPlus {
using P = Point<T>;
P operator()(const P &A, const P &B) const {
return P(A.X + B.X, A.Y + B.Y);
}
};

template <typename T, int Dim, class BinaryOperation>
void testKnown(T Identity, T A, T B) {

BinaryOperation BOp;
void testKnown(T Identity, BinaryOperation BOp, T A, T B) {
buffer<T, 1> ReduBuf(1);

queue Q;
@@ -81,17 +53,15 @@ void testKnown(T Identity, T A, T B) {
assert(Redu.getIdentity() == Identity &&
"Failed getIdentity() check().");
test_reducer(Redu, A, B);
test_reducer(Redu, Identity, A, B);
test_reducer(Redu, Identity, BOp, A, B);

// Command group must have at least one task in it. Use an empty one.
CGH.single_task<Known<T, Dim, BinaryOperation>>([=]() {});
});
}

template <typename T, int Dim, class BinaryOperation>
void testUnknown(T Identity, T A, T B) {

BinaryOperation BOp;
template <typename T, int Dim, typename KernelName, class BinaryOperation>
void testUnknown(T Identity, BinaryOperation BOp, T A, T B) {
buffer<T, 1> ReduBuf(1);
queue Q;
Q.submit([&](handler &CGH) {
@@ -102,38 +72,46 @@ void testUnknown(T Identity, T A, T B) {
auto Redu = intel::reduction(ReduAcc, Identity, BOp);
assert(Redu.getIdentity() == Identity &&
"Failed getIdentity() check().");
test_reducer(Redu, Identity, A, B);
test_reducer(Redu, Identity, BOp, A, B);

// Command group must have at least one task in it. Use an empty one.
CGH.single_task<Unknown<T, Dim, BinaryOperation>>([=]() {});
CGH.single_task<KernelName>([=]() {});
});
}

template <typename T, class BinaryOperation>
void testBoth(T Identity, T A, T B) {
testKnown<T, 0, BinaryOperation>(Identity, A, B);
testKnown<T, 1, BinaryOperation>(Identity, A, B);
testUnknown<T, 0, BinaryOperation>(Identity, A, B);
testUnknown<T, 1, BinaryOperation>(Identity, A, B);
void testBoth(T Identity, BinaryOperation BOp, T A, T B) {
testKnown<T, 0>(Identity, BOp, A, B);
testKnown<T, 1>(Identity, BOp, A, B);
testUnknown<T, 0, Unknown<T, 0, BinaryOperation>>(Identity, BOp, A, B);
testUnknown<T, 1, Unknown<T, 1, BinaryOperation>>(Identity, BOp, A, B);
}

int main() {
// testKnown does not pass identity to reduction ctor.
testBoth<int, intel::plus<int>>(0, 1, 7);
testBoth<int, std::multiplies<int>>(1, 1, 7);
testBoth<int, intel::bit_or<int>>(0, 1, 8);
testBoth<int, intel::bit_xor<int>>(0, 7, 3);
testBoth<int, intel::bit_and<int>>(~0, 7, 3);
testBoth<int, intel::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
testBoth<int, intel::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);

testBoth<float, intel::plus<float>>(0, 1, 7);
testBoth<float, std::multiplies<float>>(1, 1, 7);
testBoth<float, intel::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
testBoth<float, intel::maximum<float>>(getMinimumFPValue<float>(), 7, 3);

testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
testBoth<int>(0, intel::plus<int>(), 1, 7);
testBoth<int>(1, std::multiplies<int>(), 1, 7);
testBoth<int>(0, intel::bit_or<int>(), 1, 8);
testBoth<int>(0, intel::bit_xor<int>(), 7, 3);
testBoth<int>(~0, intel::bit_and<int>(), 7, 3);
testBoth<int>((std::numeric_limits<int>::max)(), intel::minimum<int>(), 7, 3);
testBoth<int>((std::numeric_limits<int>::min)(), intel::maximum<int>(), 7, 3);

testBoth<float>(0, intel::plus<float>(), 1, 7);
testBoth<float>(1, std::multiplies<float>(), 1, 7);
testBoth<float>(getMaximumFPValue<float>(), intel::minimum<float>(), 7, 3);
testBoth<float>(getMinimumFPValue<float>(), intel::maximum<float>(), 7, 3);

testUnknown<CustomVec<float>, 0,
Unknown<CustomVec<float>, 0, CustomVecPlus<float>>>(
CustomVec<float>(0), CustomVecPlus<float>(), CustomVec<float>(1),
CustomVec<float>(7));
testUnknown<CustomVec<float>, 1,
Unknown<CustomVec<float>, 1, CustomVecPlus<float>>>(
CustomVec<float>(0), CustomVecPlus<float>(), CustomVec<float>(1),
CustomVec<float>(7));

testUnknown<int, 0, class BitOrName>(
0, [](auto a, auto b) { return a | b; }, 1, 8);

std::cout << "Test passed\n";
return 0;
72 changes: 72 additions & 0 deletions sycl/test/reduction/reduction_nd_lambda.cpp
@@ -0,0 +1,72 @@
// UNSUPPORTED: cuda
// Reductions use work-group builtins (e.g. intel::reduce()) not yet supported
// by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(nd_range, reduction, lambda)

#include "reduction_utils.hpp"
#include <CL/sycl.hpp>
#include <cassert>

using namespace cl::sycl;

template <class KernelName, typename T, class BinaryOperation>
void test(T Identity, BinaryOperation BOp, size_t WGSize, size_t NWItems) {
buffer<T, 1> InBuf(NWItems);
buffer<T, 1> OutBuf(1);

// Initialize.
T CorrectOut;
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);

// Compute.
queue Q;
Q.submit([&](handler &CGH) {
auto In = InBuf.template get_access<access::mode::read>(CGH);
auto Out = OutBuf.template get_access<access::mode::discard_write>(CGH);
auto Redu = intel::reduction(Out, Identity, BOp);

range<1> GlobalRange(NWItems);
range<1> LocalRange(WGSize);
nd_range<1> NDRange(GlobalRange, LocalRange);
CGH.parallel_for<KernelName>(NDRange, Redu,
[=](nd_item<1> NDIt, auto &Sum) {
Sum.combine(In[NDIt.get_global_linear_id()]);
});
});

// Check correctness.
auto Out = OutBuf.template get_access<access::mode::read>();
T ComputedOut = *(Out.get_pointer());
if (ComputedOut != CorrectOut) {
std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
std::cout << "Computed value: " << ComputedOut
<< ", Expected value: " << CorrectOut << "\n";
assert(0 && "Wrong value.");
}
}

int main() {
test<class AddTestName, int>(
0, [](auto x, auto y) { return (x + y); }, 8, 32);
test<class MulTestName, int>(
0, [](auto x, auto y) { return (x * y); }, 8, 32);

// Check with CUSTOM type.
test<class CustomAddTestname, CustomVec<long long>>(
CustomVec<long long>(0),
[](auto x, auto y) {
CustomVecPlus<long long> BOp;
return BOp(x, y);
},
4, 64);

std::cout << "Test passed\n";
return 0;
}