From 14ff641fe834c0166c1d53e3c88abddd5ec0d29b Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 21 Jul 2021 20:58:02 +0200 Subject: [PATCH] Fix support for different input and output types in linalg::reduce (#296) Closes #165 Uses a C++17 `if constexpr` to discard at compile time a code path that doesn't support different input and output types, and adds a test for such a case (the test won't compile without that `constexpr` keyword). Authors: - Louis Sugy (https://github.com/Nyrio) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/raft/pull/296 --- cpp/include/raft/linalg/strided_reduction.cuh | 4 +- cpp/test/linalg/reduce.cu | 81 +++++++++++++------ cpp/test/linalg/reduce.cuh | 35 ++++---- 3 files changed, 79 insertions(+), 41 deletions(-) diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index fff09d2046..bba652e137 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -147,8 +147,8 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) - if (std::is_same>::value && - std::is_same::value) + if constexpr (std::is_same>::value && + std::is_same::value) stridedSummationKernel <<>>(dots, data, D, N, init, main_op); else diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 255cf1a696..9082397265 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -25,35 +25,40 @@ namespace raft { namespace linalg { -template +template struct ReduceInputs { - T tolerance; + OutType tolerance; int rows, cols; bool rowMajor, alongRows; unsigned long long int seed; }; -template -::std::ostream &operator<<(::std::ostream &os, const ReduceInputs &dims) { +template +::std::ostream &operator<<(::std::ostream &os, + const ReduceInputs &dims) { return os; } // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template -void reduceLaunch(T *dots, const T *data, int cols, int rows, bool rowMajor, - bool alongRows, bool inplace, cudaStream_t stream) { - reduce(dots, data, cols, rows, (T)0, rowMajor, alongRows, stream, inplace, - [] __device__(T in, int i) { return in * in; }); +template +void reduceLaunch(OutType *dots, const InType *data, int cols, int rows, + bool rowMajor, bool alongRows, bool inplace, + cudaStream_t stream) { + reduce( + dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace, + [] __device__(InType in, int i) { return static_cast(in * in); }); } -template -class ReduceTest : public ::testing::TestWithParam> { +template +class ReduceTest + : public ::testing::TestWithParam> { protected: void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); - params = ::testing::TestWithParam>::GetParam(); + params = + ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; @@ -61,7 +66,7 @@ class ReduceTest : public ::testing::TestWithParam> { raft::allocate(data, len); raft::allocate(dots_exp, outlen); raft::allocate(dots_act, outlen); - r.uniform(data, len, T(-1.0), T(1.0), stream); + r.uniform(data, len, InType(-1.0), InType(1.0), stream); naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream); @@ -84,13 +89,14 @@ class ReduceTest : public ::testing::TestWithParam> { } protected: - ReduceInputs params; - T *data, *dots_exp, *dots_act; + ReduceInputs params; + InType *data; + OutType *dots_exp, *dots_act; int outlen; cudaStream_t stream; }; -const std::vector> inputsf = { +const std::vector> inputsff = { {0.000002f, 1024, 32, true, true, 1234ULL}, {0.000002f, 1024, 64, true, true, 1234ULL}, {0.000002f, 1024, 128, true, true, 1234ULL}, @@ -108,7 +114,7 @@ const std::vector> inputsf = { {0.000002f, 1024, 128, false, false, 1234ULL}, {0.000002f, 1024, 256, false, false, 1234ULL}}; -const std::vector> inputsd = { +const std::vector> inputsdd = { {0.000000001, 1024, 32, true, true, 1234ULL}, {0.000000001, 1024, 64, true, true, 1234ULL}, {0.000000001, 1024, 128, true, true, 1234ULL}, @@ -126,21 +132,50 @@ const std::vector> inputsd = { {0.000000001, 1024, 128, false, false, 1234ULL}, {0.000000001, 1024, 256, false, false, 1234ULL}}; -typedef ReduceTest ReduceTestF; -TEST_P(ReduceTestF, Result) { +const std::vector> inputsfd = { + {0.000002f, 1024, 32, true, true, 1234ULL}, + {0.000002f, 1024, 64, true, true, 1234ULL}, + {0.000002f, 1024, 128, true, true, 1234ULL}, + {0.000002f, 1024, 256, true, true, 1234ULL}, + {0.000002f, 1024, 32, true, false, 1234ULL}, + {0.000002f, 1024, 64, true, false, 1234ULL}, + {0.000002f, 1024, 128, true, false, 1234ULL}, + {0.000002f, 1024, 256, true, false, 1234ULL}, + {0.000002f, 1024, 32, false, true, 1234ULL}, + {0.000002f, 1024, 64, false, true, 1234ULL}, + {0.000002f, 1024, 128, false, true, 1234ULL}, + {0.000002f, 1024, 256, false, true, 1234ULL}, + {0.000002f, 1024, 32, false, false, 1234ULL}, + {0.000002f, 1024, 64, false, false, 1234ULL}, + {0.000002f, 1024, 128, false, false, 1234ULL}, + {0.000002f, 1024, 256, false, false, 1234ULL}}; + +typedef ReduceTest ReduceTestFF; +TEST_P(ReduceTestFF, Result) { ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } -typedef ReduceTest ReduceTestD; -TEST_P(ReduceTestD, Result) { +typedef ReduceTest ReduceTestDD; +TEST_P(ReduceTestDD, Result) { ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestF, ::testing::ValuesIn(inputsf)); +typedef ReduceTest ReduceTestFD; +TEST_P(ReduceTestFD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, + ::testing::ValuesIn(inputsff)); + +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, + ::testing::ValuesIn(inputsdd)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, + ::testing::ValuesIn(inputsfd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 18261287cf..30a9c2e271 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -23,42 +25,43 @@ namespace raft { namespace linalg { -template -__global__ void naiveCoalescedReductionKernel(Type *dots, const Type *data, +template +__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data, int D, int N) { - Type acc = (Type)0; + OutType acc = (OutType)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { - acc += data[rowStart * D + i] * data[rowStart * D + i]; + acc += + static_cast(data[rowStart * D + i] * data[rowStart * D + i]); } dots[rowStart] = 2 * acc; } } -template -void naiveCoalescedReduction(Type *dots, const Type *data, int D, int N, +template +void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N, cudaStream_t stream) { static const int TPB = 64; int nblks = raft::ceildiv(N, TPB); - naiveCoalescedReductionKernel + naiveCoalescedReductionKernel <<>>(dots, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); } -template -void unaryAndGemv(Type *dots, const Type *data, int D, int N, +template +void unaryAndGemv(OutType *dots, const InType *data, int D, int N, cudaStream_t stream) { //computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns - thrust::device_vector sq(D * N); + thrust::device_vector sq(D * N); raft::linalg::unaryOp( thrust::raw_pointer_cast(sq.data()), data, D * N, - [] __device__(Type v) { return v * v; }, stream); + [] __device__(InType v) { return static_cast(v * v); }, stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] - Type alpha = 1, beta = 0; + thrust::device_vector ones(N, 1); //column vector [1...1] + OutType alpha = 1, beta = 0; CUBLAS_CHECK(raft::linalg::cublasgemv( handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); @@ -66,9 +69,9 @@ void unaryAndGemv(Type *dots, const Type *data, int D, int N, CUBLAS_CHECK(cublasDestroy(handle)); } -template -void naiveReduction(Type *dots, const Type *data, int D, int N, bool rowMajor, - bool alongRows, cudaStream_t stream) { +template +void naiveReduction(OutType *dots, const InType *data, int D, int N, + bool rowMajor, bool alongRows, cudaStream_t stream) { if (rowMajor && alongRows) { naiveCoalescedReduction(dots, data, D, N, stream); } else if (rowMajor && !alongRows) {