[REVIEW] Fix support for different input and output types in linalg::reduce #296

Merged: 1 commit, Jul 21, 2021
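In short, this change lets `raft::linalg::reduce` accumulate into a different type than its input: the same-type fast path in `stridedReduction` becomes an `if constexpr`, and the tests gain a float-in/double-out case. A minimal sketch of the newly supported mixed-type call, modeled on the `reduceLaunch` helper in the test diff below (the device buffers and stream here are assumed, not part of this PR):

// Sketch only: sum of squares of float inputs, accumulated as double,
// i.e. the InType = float / OutType = double combination this PR enables.
float *d_in;    // rows x cols input matrix on the device (assumed allocated)
double *d_out;  // length-rows output vector on the device (assumed allocated)
int rows = 1024, cols = 128;
cudaStream_t stream;  // assumed created with cudaStreamCreate
raft::linalg::reduce(
  d_out, d_in, cols, rows, 0.0, /*rowMajor=*/true, /*alongRows=*/true,
  stream, /*inplace=*/false,
  [] __device__(float in, int i) { return static_cast<double>(in * in); });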
cpp/include/raft/linalg/strided_reduction.cuh (4 changes: 2 additions & 2 deletions)
@@ -147,8 +147,8 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
 
   ///@todo: this complication should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
-  if (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
-      std::is_same<InType, OutType>::value)
+  if constexpr (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
+                std::is_same<InType, OutType>::value)
     stridedSummationKernel<InType>
       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
   else
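Why `if constexpr` is the fix: a plain `if` type-checks both branches for every instantiation, so the launch of `stridedSummationKernel<InType>`, which writes its sums through the `OutType *dots` pointer, failed to compile whenever `InType != OutType`, even though that branch could never be taken at run time. `if constexpr` discards the untaken branch during instantiation. A minimal host-side sketch of the same pattern (hypothetical names, not RAFT's API):

#include <cstdio>
#include <type_traits>

// Fast path: input and output must share a single type T.
template <typename T>
void fast_sum(T *out, const T *in, int n) {
  T acc = T(0);
  for (int i = 0; i < n; ++i) acc += in[i];
  *out = acc;
}

// Generic path: converts each element to the output type.
template <typename In, typename Out>
void generic_sum(Out *out, const In *in, int n) {
  Out acc = Out(0);
  for (int i = 0; i < n; ++i) acc += static_cast<Out>(in[i]);
  *out = acc;
}

template <typename In, typename Out>
void sum_dispatch(Out *out, const In *in, int n) {
  // With a plain `if`, fast_sum<In>(out, ...) would be type-checked for
  // every instantiation, and Out* does not convert to In* when In != Out.
  // `if constexpr` drops the untaken branch, so <float, double> compiles.
  if constexpr (std::is_same<In, Out>::value) {
    fast_sum<In>(out, in, n);
  } else {
    generic_sum<In, Out>(out, in, n);
  }
}

int main() {
  float in[3] = {1.f, 2.f, 3.f};
  float out_f;
  double out_d;
  sum_dispatch(&out_f, in, 3);  // same-type fast path
  sum_dispatch(&out_d, in, 3);  // mixed-type path; breaks with a plain `if`
  std::printf("%f %f\n", out_f, out_d);
  return 0;
}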
cpp/test/linalg/reduce.cu (81 changes: 58 additions & 23 deletions)
@@ -25,43 +25,48 @@
 namespace raft {
 namespace linalg {
 
-template <typename T>
+template <typename InType, typename OutType>
 struct ReduceInputs {
-  T tolerance;
+  OutType tolerance;
   int rows, cols;
   bool rowMajor, alongRows;
   unsigned long long int seed;
 };
 
-template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const ReduceInputs<T> &dims) {
+template <typename InType, typename OutType>
+::std::ostream &operator<<(::std::ostream &os,
+                           const ReduceInputs<InType, OutType> &dims) {
   return os;
 }
 
 // Or else, we get the following compilation error
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
-template <typename T>
-void reduceLaunch(T *dots, const T *data, int cols, int rows, bool rowMajor,
-                  bool alongRows, bool inplace, cudaStream_t stream) {
-  reduce(dots, data, cols, rows, (T)0, rowMajor, alongRows, stream, inplace,
-         [] __device__(T in, int i) { return in * in; });
+template <typename InType, typename OutType>
+void reduceLaunch(OutType *dots, const InType *data, int cols, int rows,
+                  bool rowMajor, bool alongRows, bool inplace,
+                  cudaStream_t stream) {
+  reduce(
+    dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace,
+    [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
 }
 
-template <typename T>
-class ReduceTest : public ::testing::TestWithParam<ReduceInputs<T>> {
+template <typename InType, typename OutType>
+class ReduceTest
+  : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
  protected:
   void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
-    params = ::testing::TestWithParam<ReduceInputs<T>>::GetParam();
+    params =
+      ::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
     outlen = params.alongRows ? rows : cols;
     raft::allocate(data, len);
     raft::allocate(dots_exp, outlen);
     raft::allocate(dots_act, outlen);
-    r.uniform(data, len, T(-1.0), T(1.0), stream);
+    r.uniform(data, len, InType(-1.0), InType(1.0), stream);
     naiveReduction(dots_exp, data, cols, rows, params.rowMajor,
                    params.alongRows, stream);
@@ -84,13 +89,14 @@ class ReduceTest : public ::testing::TestWithParam<ReduceInputs<T>> {
   }
 
  protected:
-  ReduceInputs<T> params;
-  T *data, *dots_exp, *dots_act;
+  ReduceInputs<InType, OutType> params;
+  InType *data;
+  OutType *dots_exp, *dots_act;
   int outlen;
   cudaStream_t stream;
 };
 
-const std::vector<ReduceInputs<float>> inputsf = {
+const std::vector<ReduceInputs<float, float>> inputsff = {
   {0.000002f, 1024, 32, true, true, 1234ULL},
   {0.000002f, 1024, 64, true, true, 1234ULL},
   {0.000002f, 1024, 128, true, true, 1234ULL},
@@ -108,7 +114,7 @@ const std::vector<ReduceInputs<float>> inputsf = {
   {0.000002f, 1024, 128, false, false, 1234ULL},
   {0.000002f, 1024, 256, false, false, 1234ULL}};
 
-const std::vector<ReduceInputs<double>> inputsd = {
+const std::vector<ReduceInputs<double, double>> inputsdd = {
   {0.000000001, 1024, 32, true, true, 1234ULL},
   {0.000000001, 1024, 64, true, true, 1234ULL},
   {0.000000001, 1024, 128, true, true, 1234ULL},
@@ -126,21 +132,50 @@ const std::vector<ReduceInputs<double>> inputsd = {
   {0.000000001, 1024, 128, false, false, 1234ULL},
   {0.000000001, 1024, 256, false, false, 1234ULL}};
 
+const std::vector<ReduceInputs<float, double>> inputsfd = {
+  {0.000002f, 1024, 32, true, true, 1234ULL},
+  {0.000002f, 1024, 64, true, true, 1234ULL},
+  {0.000002f, 1024, 128, true, true, 1234ULL},
+  {0.000002f, 1024, 256, true, true, 1234ULL},
+  {0.000002f, 1024, 32, true, false, 1234ULL},
+  {0.000002f, 1024, 64, true, false, 1234ULL},
+  {0.000002f, 1024, 128, true, false, 1234ULL},
+  {0.000002f, 1024, 256, true, false, 1234ULL},
+  {0.000002f, 1024, 32, false, true, 1234ULL},
+  {0.000002f, 1024, 64, false, true, 1234ULL},
+  {0.000002f, 1024, 128, false, true, 1234ULL},
+  {0.000002f, 1024, 256, false, true, 1234ULL},
+  {0.000002f, 1024, 32, false, false, 1234ULL},
+  {0.000002f, 1024, 64, false, false, 1234ULL},
+  {0.000002f, 1024, 128, false, false, 1234ULL},
+  {0.000002f, 1024, 256, false, false, 1234ULL}};
+
-typedef ReduceTest<float> ReduceTestF;
-TEST_P(ReduceTestF, Result) {
+typedef ReduceTest<float, float> ReduceTestFF;
+TEST_P(ReduceTestFF, Result) {
   ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
                           raft::CompareApprox<float>(params.tolerance)));
 }
 
-typedef ReduceTest<double> ReduceTestD;
-TEST_P(ReduceTestD, Result) {
+typedef ReduceTest<double, double> ReduceTestDD;
+TEST_P(ReduceTestDD, Result) {
   ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
                           raft::CompareApprox<double>(params.tolerance)));
 }
 
+typedef ReduceTest<float, double> ReduceTestFD;
+TEST_P(ReduceTestFD, Result) {
+  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
+                          raft::CompareApprox<double>(params.tolerance)));
+}
+
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF,
+                        ::testing::ValuesIn(inputsff));
+
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD,
+                        ::testing::ValuesIn(inputsdd));
+
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD,
+                        ::testing::ValuesIn(inputsfd));
 
 }  // end namespace linalg
 }  // end namespace raft
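A note on the aliasing pattern above: `TEST_P` token-pastes the fixture name into a generated class identifier, and the comma inside `ReduceTest<float, double>` would additionally be read as a macro-argument separator, so each instantiation first gets a single-token alias. This is a gtest/preprocessor constraint, not something introduced by this PR:

// Not viable: the preprocessor splits the argument at the comma, and
// "ReduceTest<float, double>" cannot be pasted into a class name anyway.
//   TEST_P(ReduceTest<float, double>, Result) { ... }
// Viable, as in the diff above: alias first, then register the test.
typedef ReduceTest<float, double> ReduceTestFD;
TEST_P(ReduceTestFD, Result) { /* assertions as above */ }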
cpp/test/linalg/reduce.cuh (35 changes: 19 additions & 16 deletions)
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include <cublas_v2.h>
 #include <raft/linalg/cublas_wrappers.h>
 #include <thrust/device_vector.h>
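(A side note on the added `#pragma once`: these naive reference helpers are templates defined in a header, so pulling reduce.cuh into one translation unit twice would otherwise trigger redefinition errors.)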
@@ -23,52 +25,53 @@
 namespace raft {
 namespace linalg {
 
-template <typename Type>
-__global__ void naiveCoalescedReductionKernel(Type *dots, const Type *data,
+template <typename InType, typename OutType>
+__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data,
                                               int D, int N) {
-  Type acc = (Type)0;
+  OutType acc = (OutType)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
-      acc += data[rowStart * D + i] * data[rowStart * D + i];
+      acc +=
+        static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
     }
     dots[rowStart] = 2 * acc;
   }
 }
 
-template <typename Type>
-void naiveCoalescedReduction(Type *dots, const Type *data, int D, int N,
+template <typename InType, typename OutType>
+void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N,
                              cudaStream_t stream) {
   static const int TPB = 64;
   int nblks = raft::ceildiv(N, TPB);
-  naiveCoalescedReductionKernel<Type>
+  naiveCoalescedReductionKernel<InType, OutType>
     <<<nblks, TPB, 0, stream>>>(dots, data, D, N);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
-template <typename Type>
-void unaryAndGemv(Type *dots, const Type *data, int D, int N,
+template <typename InType, typename OutType>
+void unaryAndGemv(OutType *dots, const InType *data, int D, int N,
                   cudaStream_t stream) {
   //computes a MLCommon unary op on data (squares it), then computes Ax
   //(A input matrix and x column vector) to sum columns
-  thrust::device_vector<Type> sq(D * N);
+  thrust::device_vector<OutType> sq(D * N);
   raft::linalg::unaryOp(
     thrust::raw_pointer_cast(sq.data()), data, D * N,
-    [] __device__(Type v) { return v * v; }, stream);
+    [] __device__(InType v) { return static_cast<OutType>(v * v); }, stream);
   cublasHandle_t handle;
   CUBLAS_CHECK(cublasCreate(&handle));
-  thrust::device_vector<Type> ones(N, 1);  //column vector [1...1]
-  Type alpha = 1, beta = 0;
+  thrust::device_vector<OutType> ones(N, 1);  //column vector [1...1]
+  OutType alpha = 1, beta = 0;
   CUBLAS_CHECK(raft::linalg::cublasgemv(
     handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D,
     thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream));
   CUDA_CHECK(cudaDeviceSynchronize());
   CUBLAS_CHECK(cublasDestroy(handle));
 }
 
-template <typename Type>
-void naiveReduction(Type *dots, const Type *data, int D, int N, bool rowMajor,
-                    bool alongRows, cudaStream_t stream) {
+template <typename InType, typename OutType>
+void naiveReduction(OutType *dots, const InType *data, int D, int N,
+                    bool rowMajor, bool alongRows, cudaStream_t stream) {
   if (rowMajor && alongRows) {
     naiveCoalescedReduction(dots, data, D, N, stream);
   } else if (rowMajor && !alongRows) {
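For reference, the cuBLAS call in `unaryAndGemv` is a column summation in disguise: `sq` holds the squared inputs as a D x N column-major matrix, `ones` is a length-N vector of 1s, and gemv with `alpha = 1`, `beta = 0` computes `dots = sq * ones`. Spelled out as a plain host loop (an illustrative sketch, not code from this PR):

// Equivalent of cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, sq, D,
//                          ones, 1, &beta, dots, 1, stream)
// with alpha = 1, beta = 0: element sq(d, n) lives at sq[d + n * D].
for (int d = 0; d < D; ++d) {
  double acc = 0;            // plays the role of the OutType accumulator
  for (int n = 0; n < N; ++n) {
    acc += sq[d + n * D];    // add the d-th entry of column n
  }
  dots[d] = acc;             // dots = sq * [1, ..., 1]^T
}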