LinAlg impl in detail #383

Merged Feb 8, 2022 · 27 commits

Changes from 1 commit

Commits
478ddac
working through
divyegala Oct 21, 2021
d4b72ba
working ththrough
divyegala Nov 5, 2021
b472870
linalg detail
divyegala Nov 17, 2021
3bd9645
merging branch 22.02
divyegala Nov 17, 2021
788ffa8
style fix
divyegala Nov 17, 2021
f7d43b5
correcting include
divyegala Nov 17, 2021
282cd48
merging branch-21.12
divyegala Nov 17, 2021
37596c9
Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.12-l…
divyegala Nov 17, 2021
cd4e1f9
merging upstream
divyegala Dec 14, 2021
9c0d655
removing deleted file again
divyegala Dec 14, 2021
a071d09
correcting merges and passing tests
divyegala Dec 14, 2021
db817f6
changing h extensions to hpp
divyegala Dec 14, 2021
abec4d2
cublas/cusolver only in detail, wrap up rest of linalg
divyegala Dec 22, 2021
b424cf1
merging upstream
divyegala Dec 22, 2021
34b2439
correcting doxygen build
divyegala Dec 22, 2021
897e6f7
correcting wrong docs
divyegala Dec 22, 2021
3d4b5f1
review feedback
divyegala Jan 11, 2022
4163619
merging branch-22.02
divyegala Jan 25, 2022
8ff01a9
Merge remote-tracking branch 'upstream/branch-22.04' into imp-21.12-l…
divyegala Jan 25, 2022
b6471d6
review changes
divyegala Jan 26, 2022
5d8c176
more macro renames
divyegala Jan 27, 2022
14cddfc
adding explict stream set back to cublas and cusolver wrappers
divyegala Feb 2, 2022
a2f670f
resolving errors
divyegala Feb 2, 2022
89bf3c1
adding set stream to cublas set pointer mode
divyegala Feb 4, 2022
3c5d303
Merge branch 'branch-22.04' into imp-linalg-public
cjnolet Feb 4, 2022
5759c80
Merge branch 'branch-22.04' into imp-21.12-linalg_detail
cjnolet Feb 7, 2022
f94beef
Fixing a bad merge
cjnolet Feb 7, 2022
Commit shown below: linalg detail
divyegala committed Nov 17, 2021

commit b472870b53770411edab225f382dc0a658c8f734
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/correlation.cuh
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@
#pragma once
#include <raft/cuda_utils.cuh>
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/reduce.cuh>
#include <raft/linalg/reduce.hpp>

namespace raft {
namespace distance {
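This is the first of several identical header swaps in this commit: the distance and label headers stop including the linalg .cuh files directly and include the .hpp wrappers instead (the same one-line change repeats in the cosine, euclidean, fused_l2_nn, hellinger, pairwise_distance_base, and classlabels diffs below). A minimal sketch of the pattern in a downstream header, using paths from these diffs; the surrounding file is illustrative:

// before this commit: linalg primitives were included as .cuh headers
// #include <raft/linalg/reduce.cuh>
// #include <raft/linalg/norm.cuh>

// after this commit: the public .hpp wrappers are included instead,
// while the .cuh implementations move under raft/linalg/detail/
#include <raft/linalg/reduce.hpp>
#include <raft/linalg/norm.hpp>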
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/cosine.cuh
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@
#pragma once

#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/norm.hpp>

namespace raft {
namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/euclidean.cuh
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@

#pragma once
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/norm.hpp>

namespace raft {
namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/fused_l2_nn.cuh
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@
#include <limits>
#include <raft/cuda_utils.cuh>
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/contractions.cuh>
#include <raft/linalg/contractions.hpp>

namespace raft {
namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/hellinger.cuh
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@

#pragma once
#include <raft/distance/detail/pairwise_distance_base.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/linalg/unary_op.hpp>

namespace raft {
namespace distance {
4 changes: 2 additions & 2 deletions cpp/include/raft/distance/detail/pairwise_distance_base.cuh
Original file line number Diff line number Diff line change
@@ -16,8 +16,8 @@
#pragma once
#include <raft/cudart_utils.h>
#include <raft/cuda_utils.cuh>
#include <raft/linalg/contractions.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/contractions.hpp>
#include <raft/linalg/norm.hpp>
#include <raft/vectorized.cuh>

#include <cstddef>
2 changes: 1 addition & 1 deletion cpp/include/raft/label/classlabels.cuh
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@

#include <raft/cudart_utils.h>
#include <raft/cuda_utils.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/linalg/unary_op.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

Original file line number Diff line number Diff line change
@@ -19,8 +19,8 @@
#include "detail/add.cuh"
#include "detail/functional.cuh"

#include "binary_op.cuh"
#include "unary_op.cuh"
#include "binary_op.hpp"
#include "unary_op.hpp"

namespace raft {
namespace linalg {
Original file line number Diff line number Diff line change
@@ -43,35 +43,7 @@ template <typename InType, typename Lambda, typename OutType = InType,
typename IdxType = int, int TPB = 256>
void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
Lambda op, cudaStream_t stream) {
constexpr auto maxSize =
sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
size_t bytes = len * maxSize;
uint64_t in1Addr = uint64_t(in1);
uint64_t in2Addr = uint64_t(in2);
uint64_t outAddr = uint64_t(out);
if (16 / maxSize && bytes % 16 == 0 &&
detail::addressAligned(in1Addr, in2Addr, outAddr, 16)) {
detail::binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (8 / maxSize && bytes % 8 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 8)) {
detail::binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (4 / maxSize && bytes % 4 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 4)) {
detail:: binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (2 / maxSize && bytes % 2 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 2)) {
detail::binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (1 / maxSize) {
detail::binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else {
detail::binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
op, stream);
}
detail::binaryOp(out, in1, in2, len, op, stream);
}

}; // end namespace linalg
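The public binaryOp above is now a thin forwarder into detail::binaryOp; its signature is unchanged. A minimal usage sketch against the public wrapper (the device lambda and helper function are illustrative, not part of this diff; the extended lambda needs nvcc with --extended-lambda):

#include <raft/linalg/binary_op.hpp>

// Element-wise out[i] = a[i] + b[i] on a stream.
void add_vectors(float* out, const float* a, const float* b, int len, cudaStream_t stream)
{
  raft::linalg::binaryOp(
    out, a, b, len, [] __device__(float x, float y) { return x + y; }, stream);
}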
Original file line number Diff line number Diff line change
@@ -57,7 +57,7 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N,
MainLambda main_op = raft::Nop<InType, IdxType>(),
ReduceLambda reduce_op = raft::Sum<OutType>(),
FinalLambda final_op = raft::Nop<OutType>()) {
detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op)
detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
}

}; // end namespace linalg
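Similarly, coalescedReduction keeps its public signature and forwards to the detail namespace (the diff also adds the semicolon missing from the forwarding call). A minimal usage sketch, assuming the public header is coalesced_reduction.hpp and that D is the length of each coalesced (leading-dimension) reduction while N is the number of reductions, per the existing raft convention:

#include <raft/linalg/coalesced_reduction.hpp>  // assumed public header name

// Sum every length-D contiguous row of `data` into dots[0..N); the default
// main/reduce/final ops give a plain sum starting from init = 0.
void row_sums(float* dots, const float* data, int D, int N, cudaStream_t stream)
{
  raft::linalg::coalescedReduction(dots, data, D, N, 0.0f, stream, false);
}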
File renamed without changes.
2 changes: 2 additions & 0 deletions cpp/include/raft/linalg/detail/add.cuh
Original file line number Diff line number Diff line change
@@ -16,6 +16,8 @@

#pragma once

#include <raft/cuda_utils.cuh>

namespace raft {
namespace linalg {
namespace detail {
35 changes: 35 additions & 0 deletions cpp/include/raft/linalg/detail/binary_op.cuh
Original file line number Diff line number Diff line change
@@ -61,6 +61,41 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
}

template <typename InType, typename Lambda, typename OutType = InType,
typename IdxType = int, int TPB = 256>
void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
Lambda op, cudaStream_t stream) {
constexpr auto maxSize =
sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
size_t bytes = len * maxSize;
uint64_t in1Addr = uint64_t(in1);
uint64_t in2Addr = uint64_t(in2);
uint64_t outAddr = uint64_t(out);
if (16 / maxSize && bytes % 16 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 16)) {
binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (8 / maxSize && bytes % 8 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 8)) {
binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (4 / maxSize && bytes % 4 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 4)) {
binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (2 / maxSize && bytes % 2 == 0 &&
addressAligned(in1Addr, in2Addr, outAddr, 2)) {
binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else if (1 / maxSize) {
binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
out, in1, in2, len, op, stream);
} else {
binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
op, stream);
}
}

} // namespace detail
} // namespace linalg
} // namespace raft
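To make the dispatch above concrete with a worked example (not part of the diff): for float inputs and float outputs, maxSize is 4 bytes, so when the byte count is a multiple of 16 and all three pointers are 16-byte aligned, binaryOpImpl is instantiated with a vector width of 16 / 4 = 4 elements per thread; buffers that fail the size or alignment checks fall through to narrower widths (2, then 1), and element types wider than 16 bytes take the final single-element branch.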
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@
#include <raft/linalg/cusolver_wrappers.h>
#include <raft/cuda_utils.cuh>
#include <raft/handle.hpp>
#include <raft/linalg/binary_op.cuh>
#include <raft/linalg/binary_op.hpp>

namespace raft {
namespace linalg {
48 changes: 44 additions & 4 deletions cpp/include/raft/linalg/detail/eig.hpp
Original file line number Diff line number Diff line change
@@ -21,7 +21,7 @@
#include <raft/linalg/cusolver_wrappers.h>
#include <raft/cuda_utils.cuh>
#include <raft/handle.hpp>
#include <raft/matrix/matrix.cuh>
#include <raft/matrix/matrix.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

@@ -30,9 +30,9 @@ namespace linalg {
namespace detail {

template <typename math_t>
void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
int n_cols, math_t *eig_vectors, math_t *eig_vals,
cudaStream_t stream) {
void eigDC_legacy(const raft::handle_t &handle, const math_t *in,
std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors,
math_t *eig_vals, cudaStream_t stream) {
cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();

int lwork;
@@ -51,10 +51,50 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
d_dev_info.data(), stream));
CUDA_CHECK(cudaGetLastError());

auto dev_info = d_dev_info.value(stream);
ASSERT(dev_info == 0,
"eig.cuh: eigensolver couldn't converge to a solution. "
"This usually occurs when some of the features do not vary enough.");
}

template <typename math_t>
void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows,
std::size_t n_cols, math_t *eig_vectors, math_t *eig_vals,
cudaStream_t stream) {
#if CUDART_VERSION < 11010
eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
#else
cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();

cusolverDnParams_t dn_params = nullptr;
CUSOLVER_CHECK(cusolverDnCreateParams(&dn_params));

size_t workspaceDevice = 0;
size_t workspaceHost = 0;
CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize(
cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER,
static_cast<int64_t>(n_rows), eig_vectors, static_cast<int64_t>(n_cols),
eig_vals, &workspaceDevice, &workspaceHost, stream));

rmm::device_uvector<math_t> d_work(workspaceDevice / sizeof(math_t), stream);
rmm::device_scalar<int> d_dev_info(stream);
std::vector<math_t> h_work(workspaceHost / sizeof(math_t));

raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);

CUSOLVER_CHECK(cusolverDnxsyevd(
cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER,
static_cast<int64_t>(n_rows), eig_vectors, static_cast<int64_t>(n_cols),
eig_vals, d_work.data(), workspaceDevice, h_work.data(), workspaceHost,
d_dev_info.data(), stream));

CUDA_CHECK(cudaGetLastError());
CUSOLVER_CHECK(cusolverDnDestroyParams(dn_params));
int dev_info = d_dev_info.value(stream);
ASSERT(dev_info == 0,
"eig.cuh: eigensolver couldn't converge to a solution. "
"This usually occurs when some of the features do not vary enough.");
#endif
}

enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
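The new eigDC dispatches on CUDART_VERSION: toolkits older than 11.1 keep the legacy path (eigDC_legacy), while newer ones go through raft's cusolverDnxsyevd wrappers and cusolverDnParams. A minimal caller sketch, assuming the public raft::linalg::eigDC wrapper mirrors the detail signature shown above and that the public header is eig.hpp:

#include <cstddef>
#include <raft/handle.hpp>
#include <raft/linalg/eig.hpp>  // assumed public header for the eigDC wrapper

// Symmetric eigendecomposition of an n x n matrix `in` (e.g. a covariance matrix).
void eigen_decompose(const raft::handle_t& handle, const float* in, std::size_t n,
                     float* eig_vectors, float* eig_vals, cudaStream_t stream)
{
  // On CUDA < 11.1 this routes to eigDC_legacy; otherwise to the cusolverDnxsyevd path.
  raft::linalg::eigDC(handle, in, n, n, eig_vectors, eig_vals, stream);
}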
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,25 +23,8 @@

namespace raft {
namespace linalg {
namespace detail {

/**
* @brief the wrapper of cublas gemm function
* It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C
* @tparam math_t the type of input/output matrices
* @param handle raft handle
* @param a input matrix
* @param n_rows_a number of rows of A
* @param n_cols_a number of columns of A
* @param b input matrix
* @param c output matrix
* @param n_rows_c number of rows of C
* @param n_cols_c number of columns of C
* @param trans_a cublas transpose op for A
* @param trans_b cublas transpose op for B
* @param alpha scalar
* @param beta scalar
* @param stream cuda stream
*/
template <typename math_t>
void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
@@ -59,36 +42,6 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
b, ldb, &beta, c, ldc, stream));
}

template <typename math_t>
void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
cublasOperation_t trans_a, cublasOperation_t trans_b,
cudaStream_t stream) {
math_t alpha = math_t(1);
math_t beta = math_t(0);
gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
trans_b, alpha, beta, stream);
}

/**
* @brief A wrapper for CUBLS GEMM function designed for handling all possible
* combinations of operand layouts.
* It computes the following equation: Z = alpha . X * Y + beta . Z
* @tparam T Data type of input/output matrices (float/double)
* @param handle raft handle
* @param z output matrix of size M rows x N columns
* @param x input matrix of size M rows x K columns
* @param y input matrix of size K rows x N columns
* @param _M number of rows of X and Z
* @param _N number of rows of Y and columns of Z
* @param _K number of columns of X and rows of Y
* @param isZColMajor Storage layout of Z. true = col major, false = row major
* @param isXColMajor Storage layout of X. true = col major, false = row major
* @param isYColMajor Storage layout of Y. true = col major, false = row major
* @param stream cuda stream
* @param alpha scalar
* @param beta scalar
*/
template <typename T>
void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor,
@@ -166,5 +119,6 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
b, ldb, &beta, c, ldc, stream));
}

} // end namespace linalg
} // end namespace raft
} // namespace detail
} // namespace linalg
} // namespace raft
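The doxygen blocks removed here documented the cublas gemm wrappers, which this commit makes detail-only. For reference, a minimal sketch of the documented overload as a caller would use it through the public namespace, assuming the public wrapper forwards to the detail overload shown above; the header name and the column-major layout of the buffers are assumptions:

#include <cublas_v2.h>
#include <raft/handle.hpp>
#include <raft/linalg/gemm.hpp>  // assumed public header for the wrapper

// c = alpha * a * b + beta * c, with a (m x k), b (k x n), c (m x n), column-major.
void matmul(const raft::handle_t& handle, const float* a, const float* b, float* c,
            int m, int k, int n, cudaStream_t stream)
{
  const float alpha = 1.0f, beta = 0.0f;
  raft::linalg::gemm(handle, a, m, k, b, c, m, n,
                     CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
}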
You are viewing a condensed version of this merge commit.