[REVIEW] Fix support for different input and output types in linalg::reduce #296

Merged: 1 commit, Jul 21, 2021
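In short, this change lets `raft::linalg::reduce` accumulate into a different type than its input: the same-type fast path in `stridedReduction` becomes an `if constexpr`, and the tests gain a float-in/double-out case. A minimal sketch of the newly supported mixed-type call, modeled on the `reduceLaunch` helper in the test diff below (the device buffers and stream here are assumed, not part of this PR):

// Sketch only: sum of squares of float inputs, accumulated as double,
// i.e. the InType = float / OutType = double combination this PR enables.
float *d_in;    // rows x cols input matrix on the device (assumed allocated)
double *d_out;  // length-rows output vector on the device (assumed allocated)
int rows = 1024, cols = 128;
cudaStream_t stream;  // assumed created with cudaStreamCreate
raft::linalg::reduce(
  d_out, d_in, cols, rows, 0.0, /*rowMajor=*/true, /*alongRows=*/true,
  stream, /*inplace=*/false,
  [] __device__(float in, int i) { return static_cast<double>(in * in); });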
cpp/include/raft/linalg/strided_reduction.cuh (4 changes: 2 additions & 2 deletions)
@@ -147,8 +147,8 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
 
   ///@todo: this complication should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
-  if (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
-      std::is_same<InType, OutType>::value)
+  if constexpr (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
+                std::is_same<InType, OutType>::value)
     stridedSummationKernel<InType>
       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
   else
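Why `if constexpr` is the fix: a plain `if` type-checks both branches for every instantiation, so the launch of `stridedSummationKernel<InType>`, which writes its sums through the `OutType *dots` pointer, failed to compile whenever `InType != OutType`, even though that branch could never be taken at run time. `if constexpr` discards the untaken branch during instantiation. A minimal host-side sketch of the same pattern (hypothetical names, not RAFT's API):

#include <cstdio>
#include <type_traits>

// Fast path: input and output must share a single type T.
template <typename T>
void fast_sum(T *out, const T *in, int n) {
  T acc = T(0);
  for (int i = 0; i < n; ++i) acc += in[i];
  *out = acc;
}

// Generic path: converts each element to the output type.
template <typename In, typename Out>
void generic_sum(Out *out, const In *in, int n) {
  Out acc = Out(0);
  for (int i = 0; i < n; ++i) acc += static_cast<Out>(in[i]);
  *out = acc;
}

template <typename In, typename Out>
void sum_dispatch(Out *out, const In *in, int n) {
  // With a plain `if`, fast_sum<In>(out, ...) would be type-checked for
  // every instantiation, and Out* does not convert to In* when In != Out.
  // `if constexpr` drops the untaken branch, so <float, double> compiles.
  if constexpr (std::is_same<In, Out>::value) {
    fast_sum<In>(out, in, n);
  } else {
    generic_sum<In, Out>(out, in, n);
  }
}

int main() {
  float in[3] = {1.f, 2.f, 3.f};
  float out_f;
  double out_d;
  sum_dispatch(&out_f, in, 3);  // same-type fast path
  sum_dispatch(&out_d, in, 3);  // mixed-type path; breaks with a plain `if`
  std::printf("%f %f\n", out_f, out_d);
  return 0;
}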
cpp/test/linalg/reduce.cu (81 changes: 58 additions & 23 deletions)
@@ -25,43 +25,48 @@
 namespace raft {
 namespace linalg {
 
-template <typename T>
+template <typename InType, typename OutType>
 struct ReduceInputs {
-  T tolerance;
+  OutType tolerance;
   int rows, cols;
   bool rowMajor, alongRows;
   unsigned long long int seed;
 };
 
-template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const ReduceInputs<T> &dims) {
+template <typename InType, typename OutType>
+::std::ostream &operator<<(::std::ostream &os,
+                           const ReduceInputs<InType, OutType> &dims) {
   return os;
 }
 
 // Or else, we get the following compilation error
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
-template <typename T>
-void reduceLaunch(T *dots, const T *data, int cols, int rows, bool rowMajor,
-                  bool alongRows, bool inplace, cudaStream_t stream) {
-  reduce(dots, data, cols, rows, (T)0, rowMajor, alongRows, stream, inplace,
-         [] __device__(T in, int i) { return in * in; });
+template <typename InType, typename OutType>
+void reduceLaunch(OutType *dots, const InType *data, int cols, int rows,
+                  bool rowMajor, bool alongRows, bool inplace,
+                  cudaStream_t stream) {
+  reduce(
+    dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace,
+    [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
 }
 
-template <typename T>
-class ReduceTest : public ::testing::TestWithParam<ReduceInputs<T>> {
+template <typename InType, typename OutType>
+class ReduceTest
+  : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
  protected:
   void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
-    params = ::testing::TestWithParam<ReduceInputs<T>>::GetParam();
+    params =
+      ::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
     outlen = params.alongRows ? rows : cols;
     raft::allocate(data, len);
     raft::allocate(dots_exp, outlen);
     raft::allocate(dots_act, outlen);
-    r.uniform(data, len, T(-1.0), T(1.0), stream);
+    r.uniform(data, len, InType(-1.0), InType(1.0), stream);
     naiveReduction(dots_exp, data, cols, rows, params.rowMajor,
                    params.alongRows, stream);
@@ -84,13 +89,14 @@ class ReduceTest : public ::testing::TestWithParam<ReduceInputs<T>> {
   }
 
  protected:
-  ReduceInputs<T> params;
-  T *data, *dots_exp, *dots_act;
+  ReduceInputs<InType, OutType> params;
+  InType *data;
+  OutType *dots_exp, *dots_act;
   int outlen;
   cudaStream_t stream;
 };
 
-const std::vector<ReduceInputs<float>> inputsf = {
+const std::vector<ReduceInputs<float, float>> inputsff = {
   {0.000002f, 1024, 32, true, true, 1234ULL},
   {0.000002f, 1024, 64, true, true, 1234ULL},
   {0.000002f, 1024, 128, true, true, 1234ULL},
@@ -108,7 +114,7 @@ const std::vector<ReduceInputs<float>> inputsf = {
   {0.000002f, 1024, 128, false, false, 1234ULL},
   {0.000002f, 1024, 256, false, false, 1234ULL}};
 
-const std::vector<ReduceInputs<double>> inputsd = {
+const std::vector<ReduceInputs<double, double>> inputsdd = {
   {0.000000001, 1024, 32, true, true, 1234ULL},
   {0.000000001, 1024, 64, true, true, 1234ULL},
   {0.000000001, 1024, 128, true, true, 1234ULL},
@@ -126,21 +132,50 @@ const std::vector<ReduceInputs<double>> inputsd = {
   {0.000000001, 1024, 128, false, false, 1234ULL},
   {0.000000001, 1024, 256, false, false, 1234ULL}};
 
+const std::vector<ReduceInputs<float, double>> inputsfd = {
+  {0.000002f, 1024, 32, true, true, 1234ULL},
+  {0.000002f, 1024, 64, true, true, 1234ULL},
+  {0.000002f, 1024, 128, true, true, 1234ULL},
+  {0.000002f, 1024, 256, true, true, 1234ULL},
+  {0.000002f, 1024, 32, true, false, 1234ULL},
+  {0.000002f, 1024, 64, true, false, 1234ULL},
+  {0.000002f, 1024, 128, true, false, 1234ULL},
+  {0.000002f, 1024, 256, true, false, 1234ULL},
+  {0.000002f, 1024, 32, false, true, 1234ULL},
+  {0.000002f, 1024, 64, false, true, 1234ULL},
+  {0.000002f, 1024, 128, false, true, 1234ULL},
+  {0.000002f, 1024, 256, false, true, 1234ULL},
+  {0.000002f, 1024, 32, false, false, 1234ULL},
+  {0.000002f, 1024, 64, false, false, 1234ULL},
+  {0.000002f, 1024, 128, false, false, 1234ULL},
+  {0.000002f, 1024, 256, false, false, 1234ULL}};
+
-typedef ReduceTest<float> ReduceTestF;
-TEST_P(ReduceTestF, Result) {
+typedef ReduceTest<float, float> ReduceTestFF;
+TEST_P(ReduceTestFF, Result) {
   ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
                           raft::CompareApprox<float>(params.tolerance)));
 }
 
-typedef ReduceTest<double> ReduceTestD;
-TEST_P(ReduceTestD, Result) {
+typedef ReduceTest<double, double> ReduceTestDD;
+TEST_P(ReduceTestDD, Result) {
   ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
                           raft::CompareApprox<double>(params.tolerance)));
 }
 
+typedef ReduceTest<float, double> ReduceTestFD;
+TEST_P(ReduceTestFD, Result) {
+  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
+                          raft::CompareApprox<double>(params.tolerance)));
+}
+
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF,
+                        ::testing::ValuesIn(inputsff));
+
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD,
+                        ::testing::ValuesIn(inputsdd));
+
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD,
+                        ::testing::ValuesIn(inputsfd));
 
 }  // end namespace linalg
 }  // end namespace raft
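A note on the aliasing pattern above: `TEST_P` token-pastes the fixture name into a generated class identifier, and the comma inside `ReduceTest<float, double>` would additionally be read as a macro-argument separator, so each instantiation first gets a single-token alias. This is a gtest/preprocessor constraint, not something introduced by this PR:

// Not viable: the preprocessor splits the argument at the comma, and
// "ReduceTest<float, double>" cannot be pasted into a class name anyway.
//   TEST_P(ReduceTest<float, double>, Result) { ... }
// Viable, as in the diff above: alias first, then register the test.
typedef ReduceTest<float, double> ReduceTestFD;
TEST_P(ReduceTestFD, Result) { /* assertions as above */ }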
cpp/test/linalg/reduce.cuh (35 changes: 19 additions & 16 deletions)
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include <cublas_v2.h>
 #include <raft/linalg/cublas_wrappers.h>
 #include <thrust/device_vector.h>
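(A side note on the added `#pragma once`: these naive reference helpers are templates defined in a header, so pulling reduce.cuh into one translation unit twice would otherwise trigger redefinition errors.)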
@@ -23,52 +25,53 @@
 namespace raft {
 namespace linalg {
 
-template <typename Type>
-__global__ void naiveCoalescedReductionKernel(Type *dots, const Type *data,
+template <typename InType, typename OutType>
+__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data,
                                               int D, int N) {
-  Type acc = (Type)0;
+  OutType acc = (OutType)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
-      acc += data[rowStart * D + i] * data[rowStart * D + i];
+      acc +=
+        static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
     }
     dots[rowStart] = 2 * acc;
   }
 }
 
-template <typename Type>
-void naiveCoalescedReduction(Type *dots, const Type *data, int D, int N,
+template <typename InType, typename OutType>
+void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N,
                              cudaStream_t stream) {
   static const int TPB = 64;
   int nblks = raft::ceildiv(N, TPB);
-  naiveCoalescedReductionKernel<Type>
+  naiveCoalescedReductionKernel<InType, OutType>
     <<<nblks, TPB, 0, stream>>>(dots, data, D, N);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
-template <typename Type>
-void unaryAndGemv(Type *dots, const Type *data, int D, int N,
+template <typename InType, typename OutType>
+void unaryAndGemv(OutType *dots, const InType *data, int D, int N,
                   cudaStream_t stream) {
   //computes a MLCommon unary op on data (squares it), then computes Ax
   //(A input matrix and x column vector) to sum columns
-  thrust::device_vector<Type> sq(D * N);
+  thrust::device_vector<OutType> sq(D * N);
   raft::linalg::unaryOp(
     thrust::raw_pointer_cast(sq.data()), data, D * N,
-    [] __device__(Type v) { return v * v; }, stream);
+    [] __device__(InType v) { return static_cast<OutType>(v * v); }, stream);
   cublasHandle_t handle;
   CUBLAS_CHECK(cublasCreate(&handle));
-  thrust::device_vector<Type> ones(N, 1);  //column vector [1...1]
-  Type alpha = 1, beta = 0;
+  thrust::device_vector<OutType> ones(N, 1);  //column vector [1...1]
+  OutType alpha = 1, beta = 0;
   CUBLAS_CHECK(raft::linalg::cublasgemv(
     handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D,
     thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream));
   CUDA_CHECK(cudaDeviceSynchronize());
   CUBLAS_CHECK(cublasDestroy(handle));
 }
 
-template <typename Type>
-void naiveReduction(Type *dots, const Type *data, int D, int N, bool rowMajor,
-                    bool alongRows, cudaStream_t stream) {
+template <typename InType, typename OutType>
+void naiveReduction(OutType *dots, const InType *data, int D, int N,
+                    bool rowMajor, bool alongRows, cudaStream_t stream) {
   if (rowMajor && alongRows) {
     naiveCoalescedReduction(dots, data, D, N, stream);
   } else if (rowMajor && !alongRows) {
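For reference, the cuBLAS call in `unaryAndGemv` is a column summation in disguise: `sq` holds the squared inputs as a D x N column-major matrix, `ones` is a length-N vector of 1s, and gemv with `alpha = 1`, `beta = 0` computes `dots = sq * ones`. Spelled out as a plain host loop (an illustrative sketch, not code from this PR):

// Equivalent of cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, sq, D,
//                          ones, 1, &beta, dots, 1, stream)
// with alpha = 1, beta = 0: element sq(d, n) lives at sq[d + n * D].
for (int d = 0; d < D; ++d) {
  double acc = 0;            // plays the role of the OutType accumulator
  for (int n = 0; n < N; ++n) {
    acc += sq[d + n * D];    // add the d-th entry of column n
  }
  dots[d] = acc;             // dots = sq * [1, ..., 1]^T
}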