diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp index 4991d7c6e3cbb..26adff47778e9 100644 --- a/sycl/include/CL/sycl/intel/reduction.hpp +++ b/sycl/include/CL/sycl/intel/reduction.hpp @@ -147,11 +147,9 @@ using IsKnownIdentityOp = template class reducer { public: - reducer(const T &Identity) : MValue(Identity), MIdentity(Identity) {} - void combine(const T &Partial) { - BinaryOperation BOp; - MValue = BOp(MValue, Partial); - } + reducer(const T &Identity, BinaryOperation BOp) + : MValue(Identity), MIdentity(Identity), MBinaryOp(BOp) {} + void combine(const T &Partial) { MValue = MBinaryOp(MValue, Partial); } T getIdentity() const { return MIdentity; } @@ -159,6 +157,7 @@ class reducer { private: const T MIdentity; + BinaryOperation MBinaryOp; }; /// Specialization of the generic class 'reducer'. It is used for reductions @@ -183,7 +182,7 @@ class reducer::value>> { public: reducer() : MValue(getIdentity()) {} - reducer(const T &) : MValue(getIdentity()) {} + reducer(const T &, BinaryOperation) : MValue(getIdentity()) {} void combine(const T &Partial) { BinaryOperation BOp; @@ -405,7 +404,7 @@ class reduction_impl { template < typename _T = T, class _BinaryOperation = BinaryOperation, enable_if_t::value> * = nullptr> - reduction_impl(accessor_type &Acc, const T &Identity) + reduction_impl(accessor_type &Acc, const T &Identity, BinaryOperation) : MAcc(shared_ptr_class(shared_ptr_class{}, &Acc)), MIdentity(getIdentity()) { @@ -431,10 +430,10 @@ class reduction_impl { template < typename _T = T, class _BinaryOperation = BinaryOperation, enable_if_t::value> * = nullptr> - reduction_impl(accessor_type &Acc, const T &Identity) + reduction_impl(accessor_type &Acc, const T &Identity, BinaryOperation BOp) : MAcc(shared_ptr_class(shared_ptr_class{}, &Acc)), - MIdentity(Identity) { + MIdentity(Identity), MBinaryOp(BOp) { assert(Acc.get_count() == 1 && "Only scalar/1-element reductions are supported now."); } @@ -456,7 +455,7 @@ class reduction_impl { template < typename _T = T, class _BinaryOperation = BinaryOperation, enable_if_t::value> * = nullptr> - reduction_impl(T *VarPtr, const T &Identity) + reduction_impl(T *VarPtr, const T &Identity, BinaryOperation) : MIdentity(Identity), MUSMPointer(VarPtr) { // For now the implementation ignores the identity value given by user // when the implementation knows the identity. @@ -478,8 +477,8 @@ class reduction_impl { template < typename _T = T, class _BinaryOperation = BinaryOperation, enable_if_t::value> * = nullptr> - reduction_impl(T *VarPtr, const T &Identity) - : MIdentity(Identity), MUSMPointer(VarPtr) {} + reduction_impl(T *VarPtr, const T &Identity, BinaryOperation BOp) + : MIdentity(Identity), MUSMPointer(VarPtr), MBinaryOp(BOp) {} /// Associates reduction accessor with the given handler and saves reduction /// buffer so that it is alive until the command group finishes the work. @@ -563,6 +562,9 @@ class reduction_impl { return OutPtr; } + /// Returns the binary operation associated with the reduction. + BinaryOperation getBinaryOperation() const { return MBinaryOp; } + private: /// Identity of the BinaryOperation. /// The result of BinaryOperation(X, MIdentity) is equal to X for any X. @@ -576,6 +578,8 @@ class reduction_impl { /// USM pointer referencing the memory to where the result of the reduction /// must be written. Applicable/used only for USM reductions. T *MUSMPointer = nullptr; + + BinaryOperation MBinaryOp; }; /// These are the forward declaration for the classes that help to create @@ -794,9 +798,10 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, typename Reduction::result_type ReduIdentity = Redu.getIdentity(); using Name = typename get_reduction_main_kernel_name_t< KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name; + auto BOp = Redu.getBinaryOperation(); CGH.parallel_for(Range, [=](nd_item NDIt) { // Call user's functions. Reducer.MValue gets initialized there. - typename Reduction::reducer_type Reducer(ReduIdentity); + typename Reduction::reducer_type Reducer(ReduIdentity, BOp); KernelFunc(NDIt, Reducer); size_t WGSize = NDIt.get_local_range().size(); @@ -811,7 +816,6 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range &Range, // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] // LocalReds[WGSize] accumulates last/odd elements when the step // of tree-reduction loop is not even. - typename Reduction::binary_operation BOp; size_t PrevStep = WGSize; for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { if (LID < CurStep) @@ -925,6 +929,7 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH); auto ReduIdentity = Redu.getIdentity(); + auto BOp = Redu.getBinaryOperation(); using Name = typename get_reduction_aux_kernel_name_t< KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name; nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)}; @@ -943,7 +948,6 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups, // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] // LocalReds[WGSize] accumulates last/odd elements when the step // of tree-reduction loop is not even. - typename Reduction::binary_operation BOp; size_t PrevStep = WGSize; for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { if (LID < CurStep) @@ -1022,10 +1026,10 @@ template detail::reduction_impl reduction(accessor &Acc, - const T &Identity, BinaryOperation) { + const T &Identity, BinaryOperation BOp) { // The Combiner argument was needed only to define the BinaryOperation param. return detail::reduction_impl( - Acc, Identity); + Acc, Identity, BOp); } /// Creates and returns an object implementing the reduction functionality. @@ -1050,9 +1054,10 @@ reduction(accessor &Acc, /// \param Identity, and the binary operation used in the reduction. template detail::reduction_impl -reduction(T *VarPtr, const T &Identity, BinaryOperation) { +reduction(T *VarPtr, const T &Identity, BinaryOperation BOp) { return detail::reduction_impl(VarPtr, Identity); + access::mode::read_write>(VarPtr, Identity, + BOp); } /// Creates and returns an object implementing the reduction functionality. diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp index 7f8e8e9726e59..cbf1b5907cfc2 100644 --- a/sycl/test/reduction/reduction_ctor.cpp +++ b/sycl/test/reduction/reduction_ctor.cpp @@ -23,13 +23,12 @@ void test_reducer(Reduction &Redu, T A, T B) { "Wrong result of binary operation."); } -template -void test_reducer(Reduction &Redu, T Identity, T A, T B) { - typename Reduction::reducer_type Reducer(Identity); +template +void test_reducer(Reduction &Redu, T Identity, BinaryOperation BOp, T A, T B) { + typename Reduction::reducer_type Reducer(Identity, BOp); Reducer.combine(A); Reducer.combine(B); - typename Reduction::binary_operation BOp; T ExpectedValue = BOp(A, B); assert(ExpectedValue == Reducer.MValue && "Wrong result of binary operation."); @@ -40,35 +39,8 @@ class Known; template class Unknown; -template -struct Point { - Point() : X(0), Y(0) {} - Point(T X, T Y) : X(X), Y(Y) {} - Point(T V) : X(V), Y(V) {} - bool operator==(const Point &P) const { - return P.X == X && P.Y == Y; - } - T X; - T Y; -}; - -template -bool operator==(const Point &A, const Point &B) { - return A.X == B.X && A.Y == B.Y; -} - -template -struct PointPlus { - using P = Point; - P operator()(const P &A, const P &B) const { - return P(A.X + B.X, A.Y + B.Y); - } -}; - template -void testKnown(T Identity, T A, T B) { - - BinaryOperation BOp; +void testKnown(T Identity, BinaryOperation BOp, T A, T B) { buffer ReduBuf(1); queue Q; @@ -81,17 +53,15 @@ void testKnown(T Identity, T A, T B) { assert(Redu.getIdentity() == Identity && "Failed getIdentity() check()."); test_reducer(Redu, A, B); - test_reducer(Redu, Identity, A, B); + test_reducer(Redu, Identity, BOp, A, B); // Command group must have at least one task in it. Use an empty one. CGH.single_task>([=]() {}); }); } -template -void testUnknown(T Identity, T A, T B) { - - BinaryOperation BOp; +template +void testUnknown(T Identity, BinaryOperation BOp, T A, T B) { buffer ReduBuf(1); queue Q; Q.submit([&](handler &CGH) { @@ -102,38 +72,46 @@ void testUnknown(T Identity, T A, T B) { auto Redu = intel::reduction(ReduAcc, Identity, BOp); assert(Redu.getIdentity() == Identity && "Failed getIdentity() check()."); - test_reducer(Redu, Identity, A, B); + test_reducer(Redu, Identity, BOp, A, B); // Command group must have at least one task in it. Use an empty one. - CGH.single_task>([=]() {}); + CGH.single_task([=]() {}); }); } template -void testBoth(T Identity, T A, T B) { - testKnown(Identity, A, B); - testKnown(Identity, A, B); - testUnknown(Identity, A, B); - testUnknown(Identity, A, B); +void testBoth(T Identity, BinaryOperation BOp, T A, T B) { + testKnown(Identity, BOp, A, B); + testKnown(Identity, BOp, A, B); + testUnknown>(Identity, BOp, A, B); + testUnknown>(Identity, BOp, A, B); } int main() { - // testKnown does not pass identity to reduction ctor. - testBoth>(0, 1, 7); - testBoth>(1, 1, 7); - testBoth>(0, 1, 8); - testBoth>(0, 7, 3); - testBoth>(~0, 7, 3); - testBoth>((std::numeric_limits::max)(), 7, 3); - testBoth>((std::numeric_limits::min)(), 7, 3); - - testBoth>(0, 1, 7); - testBoth>(1, 1, 7); - testBoth>(getMaximumFPValue(), 7, 3); - testBoth>(getMinimumFPValue(), 7, 3); - - testUnknown, 0, PointPlus>(Point(0), Point(1), Point(7)); - testUnknown, 1, PointPlus>(Point(0), Point(1), Point(7)); + testBoth(0, intel::plus(), 1, 7); + testBoth(1, std::multiplies(), 1, 7); + testBoth(0, intel::bit_or(), 1, 8); + testBoth(0, intel::bit_xor(), 7, 3); + testBoth(~0, intel::bit_and(), 7, 3); + testBoth((std::numeric_limits::max)(), intel::minimum(), 7, 3); + testBoth((std::numeric_limits::min)(), intel::maximum(), 7, 3); + + testBoth(0, intel::plus(), 1, 7); + testBoth(1, std::multiplies(), 1, 7); + testBoth(getMaximumFPValue(), intel::minimum(), 7, 3); + testBoth(getMinimumFPValue(), intel::maximum(), 7, 3); + + testUnknown, 0, + Unknown, 0, CustomVecPlus>>( + CustomVec(0), CustomVecPlus(), CustomVec(1), + CustomVec(7)); + testUnknown, 1, + Unknown, 1, CustomVecPlus>>( + CustomVec(0), CustomVecPlus(), CustomVec(1), + CustomVec(7)); + + testUnknown( + 0, [](auto a, auto b) { return a | b; }, 1, 8); std::cout << "Test passed\n"; return 0; diff --git a/sycl/test/reduction/reduction_nd_lambda.cpp b/sycl/test/reduction/reduction_nd_lambda.cpp new file mode 100644 index 0000000000000..3d5cf21658995 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_lambda.cpp @@ -0,0 +1,72 @@ +// UNSUPPORTED: cuda +// Reductions use work-group builtins (e.g. intel::reduce()) not yet supported +// by CUDA. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, lambda) + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template +void test(T Identity, BinaryOperation BOp, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + auto Out = OutBuf.template get_access(CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for(NDRange, Redu, + [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + test( + 0, [](auto x, auto y) { return (x + y); }, 8, 32); + test( + 0, [](auto x, auto y) { return (x * y); }, 8, 32); + + // Check with CUSTOM type. + test>( + CustomVec(0), + [](auto x, auto y) { + CustomVecPlus BOp; + return BOp(x, y); + }, + 4, 64); + + std::cout << "Test passed\n"; + return 0; +}