diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index 5e194b8dd4..95103ab98e 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -16,13 +16,352 @@ #pragma once +#include + #include #include +#include +#include +#include +#include +#include +#include namespace raft { namespace matrix { namespace detail { +template +void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) +{ + auto d_src = in; + auto d_dest = out; + + raft::linalg::binaryOp( + d_dest, + d_src, + d_src, + len, + [=] __device__(math_t a, math_t b) { return scalar * a * b; }, + stream); +} + +template +void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) +{ + power(inout, inout, scalar, len, stream); +} + +template +void power(math_t* inout, int len, cudaStream_t stream) +{ + math_t scalar = 1.0; + power(inout, scalar, len, stream); +} + +template +void power(math_t* in, math_t* out, int len, cudaStream_t stream) +{ + math_t scalar = 1.0; + power(in, out, scalar, len, stream); +} + +template +void seqRoot(math_t* in, + math_t* out, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool set_neg_zero = false) +{ + auto d_src = in; + auto d_dest = out; + + raft::linalg::unaryOp( + d_dest, + d_src, + len, + [=] __device__(math_t a) { + if (set_neg_zero) { + if (a < math_t(0)) { + return math_t(0); + } else { + return sqrt(a * scalar); + } + } else { + return sqrt(a * scalar); + } + }, + stream); +} + +template +void seqRoot( + math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) +{ + seqRoot(inout, inout, scalar, len, stream, set_neg_zero); +} + +template +void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + seqRoot(in, out, scalar, len, stream); +} + +template +void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + seqRoot(inout, inout, scalar, len, stream); +} + +template +void setSmallValuesZero( + math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ + raft::linalg::unaryOp( + out, + in, + len, + [=] __device__(math_t a) { + if (a <= thres && -a <= thres) { + return math_t(0); + } else { + return a; + } + }, + stream); +} + +template +void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ + setSmallValuesZero(inout, inout, len, stream, thres); +} + +template +void reciprocal(math_t* in, + math_t* out, + math_t scalar, + int len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + auto d_src = in; + auto d_dest = out; + + raft::linalg::unaryOp( + d_dest, + d_src, + len, + [=] __device__(math_t a) { return setzero && (abs(a) <= thres) ? math_t{0} : scalar / a; }, + stream); +} + +template +void reciprocal(math_t* inout, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + reciprocal(inout, inout, scalar, len, stream, setzero, thres); +} + +template +void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + reciprocal(inout, scalar, len, stream); +} + +template +void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ + math_t scalar = 1.0; + reciprocal(in, out, scalar, len, stream); +} + +template +void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) +{ + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); +} + +template +void ratio( + const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) +{ + auto d_src = src; + auto d_dest = dest; + + rmm::device_scalar d_sum(stream); + auto* d_sum_ptr = d_sum.data(); + auto no_op = [] __device__(math_t in) { return in; }; + raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); + raft::linalg::unaryOp( + d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); +} + +template +void matrixVectorBinaryMult(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, + stream); +} + +template +void matrixVectorBinaryMultSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { + if (b == Type(0)) + return a; + else + return a * b; + }, + stream); +} + +template +void matrixVectorBinaryDiv(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, + stream); +} + +template +void matrixVectorBinaryDivSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ + if (return_zero) { + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return Type(0); + else + return a / b; + }, + stream); + } else { + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { + if (raft::myAbs(b) < Type(1e-10)) + return a; + else + return a / b; + }, + stream); + } +} + +template +void matrixVectorBinaryAdd(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); +} + +template +void matrixVectorBinarySub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); +} + // Computes the argmax(d_in) column-wise in a DxN matrix template __global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index cf908c5e6d..f9cfffe64d 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -23,6 +23,14 @@ #include +#include +#include +#include +#include +#include +#include +#include + namespace raft { namespace matrix { namespace detail { @@ -56,6 +64,98 @@ void copyRows(const m_t* in, }); } +template +void truncZeroOrigin( + m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) +{ + auto m = out_n_rows; + auto k = in_n_rows; + idx_t size = out_n_rows * out_n_cols; + auto d_q = in; + auto d_q_trunc = out; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % m; + idx_t col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); +} + +template +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto n = n_cols; + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; + auto d_q_reversed = inout; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = dest_row; + idx_t src_col = (n - dest_col) - 1; + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); +} + +template +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; + auto d_q_reversed = inout; + auto counting = thrust::make_counting_iterator(0); + + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = (m - dest_row) - 1; + ; + idx_t src_col = dest_col; + + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); +} + +template +void print(const m_t* in, + idx_t n_rows, + idx_t n_cols, + char h_separator = ' ', + char v_separator = '\n', + cudaStream_t stream = rmm::cuda_stream_default) +{ + std::vector h_matrix = std::vector(n_cols * n_rows); + raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); + + for (idx_t i = 0; i < n_rows; i++) { + for (idx_t j = 0; j < n_cols; j++) { + printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator); + } + } +} + +template +void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) +{ + for (idx_t i = 0; i < n_rows; i++) { + for (idx_t j = 0; j < n_cols; j++) { + printf("%1.4f ", in[j * n_rows + i]); + } + printf("\n"); + } +} + /** * @brief Kernel for copying a slice of a big matrix to a small matrix with a * size matches that slice @@ -173,6 +273,15 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) matrixDiagonalInverse<<>>(in, len); } +template +m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +{ + cublasHandle_t cublasH = handle.get_cublas_handle(); + m_t normval = 0; + RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + return normval; +} + } // end namespace detail } // end namespace matrix } // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp index 8639cdfb02..619e20a702 100644 --- a/cpp/include/raft/matrix/math.hpp +++ b/cpp/include/raft/matrix/math.hpp @@ -18,14 +18,6 @@ #include "detail/math.cuh" -#include -#include -#include -#include -#include -#include -#include - namespace raft { namespace matrix { @@ -45,16 +37,7 @@ namespace matrix { template void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) { - auto d_src = in; - auto d_dest = out; - - raft::linalg::binaryOp( - d_dest, - d_src, - d_src, - len, - [=] __device__(math_t a, math_t b) { return scalar * a * b; }, - stream); + detail::power(in, out, scalar, len, stream); } /** @@ -67,7 +50,7 @@ void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) template void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) { - power(inout, inout, scalar, len, stream); + detail::power(inout, scalar, len, stream); } /** @@ -79,8 +62,7 @@ void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) template void power(math_t* inout, int len, cudaStream_t stream) { - math_t scalar = 1.0; - power(inout, scalar, len, stream); + detail::power(inout, len, stream); } /** @@ -94,8 +76,7 @@ void power(math_t* inout, int len, cudaStream_t stream) template void power(math_t* in, math_t* out, int len, cudaStream_t stream) { - math_t scalar = 1.0; - power(in, out, scalar, len, stream); + detail::power(in, out, len, stream); } /** @@ -117,25 +98,7 @@ void seqRoot(math_t* in, cudaStream_t stream, bool set_neg_zero = false) { - auto d_src = in; - auto d_dest = out; - - raft::linalg::unaryOp( - d_dest, - d_src, - len, - [=] __device__(math_t a) { - if (set_neg_zero) { - if (a < math_t(0)) { - return math_t(0); - } else { - return sqrt(a * scalar); - } - } else { - return sqrt(a * scalar); - } - }, - stream); + detail::seqRoot(in, out, scalar, len, stream, set_neg_zero); } /** @@ -152,7 +115,7 @@ template void seqRoot( math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) { - seqRoot(inout, inout, scalar, len, stream, set_neg_zero); + detail::seqRoot(inout, scalar, len, stream, set_neg_zero); } /** @@ -167,33 +130,38 @@ void seqRoot( template void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) { - math_t scalar = 1.0; - seqRoot(in, out, scalar, len, stream); + detail::seqRoot(in, out, len, stream); } +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param inout: input matrix with in-place results + * @param len: number elements of input matrix + * @param stream cuda stream + */ template void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) { - math_t scalar = 1.0; - seqRoot(inout, inout, scalar, len, stream); + detail::seqRoot(inout, len, stream); } +/** + * @brief sets the small values to zero based on a defined threshold + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out: output matrix. The result is stored in the out matrix + * @param in: input matrix + * @param len: number elements of input matrix + * @param stream cuda stream + * @param thres threshold to set values to zero + */ template void setSmallValuesZero( math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) { - raft::linalg::unaryOp( - out, - in, - len, - [=] __device__(math_t a) { - if (a <= thres && -a <= thres) { - return math_t(0); - } else { - return a; - } - }, - stream); + detail::setSmallValuesZero(out, in, len, stream, thres); } /** @@ -208,7 +176,7 @@ void setSmallValuesZero( template void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) { - setSmallValuesZero(inout, inout, len, stream, thres); + detail::setSmallValuesZero(inout, len, stream, thres); } /** @@ -233,27 +201,20 @@ void reciprocal(math_t* in, bool setzero = false, math_t thres = 1e-15) { - auto d_src = in; - auto d_dest = out; - - raft::linalg::unaryOp( - d_dest, - d_src, - len, - [=] __device__(math_t a) { return setzero && (abs(a) <= thres) ? math_t{0} : scalar / a; }, - stream); + detail::reciprocal(in, out, scalar, len, stream, setzero, thres); } /** * @brief Reciprocal of every element in the input matrix * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing - * @param inout: input matrix and also the result is stored + * @param inout: input matrix with in-place results * @param scalar: every element is multiplied with scalar * @param len: number elements of input matrix * @param stream cuda stream - * @param setzero: (default false) when true and |value| result = 0) + * @param setzero round down to zero if the input is less the threshold + * @param thres the threshold used to forcibly set inputs to zero + * @{ */ template void reciprocal(math_t* inout, @@ -263,7 +224,7 @@ void reciprocal(math_t* inout, bool setzero = false, math_t thres = 1e-15) { - reciprocal(inout, inout, scalar, len, stream, setzero, thres); + detail::reciprocal(inout, scalar, len, stream, setzero, thres); } /** @@ -277,8 +238,7 @@ void reciprocal(math_t* inout, template void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) { - math_t scalar = 1.0; - reciprocal(inout, scalar, len, stream); + detail::reciprocal(inout, len, stream); } /** @@ -293,15 +253,22 @@ void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) template void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) { - math_t scalar = 1.0; - reciprocal(in, out, scalar, len, stream); + detail::reciprocal(in, out, len, stream); } +/** + * @brief set values to scalar in matrix + * @tparam math_t data-type upon which the math operation will be performed + * @param out output matrix. The result is stored in the out matrix + * @param in input matrix + * @param scalar svalar value + * @param len number elements of input matrix + * @param stream cuda stream + */ template void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) { - raft::linalg::unaryOp( - out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); + detail::setValue(out, in, scalar, len, stream); } /** @@ -318,15 +285,7 @@ template void ratio( const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) { - auto d_src = src; - auto d_dest = dest; - - rmm::device_scalar d_sum(stream); - auto* d_sum_ptr = d_sum.data(); - auto no_op = [] __device__(math_t in) { return in; }; - raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); - raft::linalg::unaryOp( - d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); + detail::ratio(handle, src, dest, len, stream); } /** @} */ @@ -359,6 +318,16 @@ void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) detail::signFlip(inout, n_rows, n_cols, stream); } +/** + * @brief multiply each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinaryMult(Type* data, const Type* vec, @@ -368,18 +337,20 @@ void matrixVectorBinaryMult(Type* data, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a * b; }, - stream); + detail::matrixVectorBinaryMult( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } +/** + * @brief multiply each row or column of matrix with vector, skipping zeros in vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinaryMultSkipZero(Type* data, const Type* vec, @@ -389,23 +360,20 @@ void matrixVectorBinaryMultSkipZero(Type* data, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { - if (b == Type(0)) - return a; - else - return a * b; - }, - stream); + detail::matrixVectorBinaryMultSkipZero( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } +/** + * @brief divide each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinaryDiv(Type* data, const Type* vec, @@ -415,18 +383,22 @@ void matrixVectorBinaryDiv(Type* data, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a / b; }, - stream); + detail::matrixVectorBinaryDiv( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } +/** + * @brief divide each row or column of matrix with vector, skipping zeros in vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + * @param return_zero result is zero if true and vector value is below threshold, original value if + * false + */ template void matrixVectorBinaryDivSkipZero(Type* data, const Type* vec, @@ -437,41 +409,20 @@ void matrixVectorBinaryDivSkipZero(Type* data, cudaStream_t stream, bool return_zero = false) { - if (return_zero) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) - return Type(0); - else - return a / b; - }, - stream); - } else { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) - return a; - else - return a / b; - }, - stream); - } + detail::matrixVectorBinaryDivSkipZero( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream, return_zero); } +/** + * @brief add each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinaryAdd(Type* data, const Type* vec, @@ -481,18 +432,20 @@ void matrixVectorBinaryAdd(Type* data, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, - stream); + detail::matrixVectorBinaryAdd( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } +/** + * @brief subtract each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ template void matrixVectorBinarySub(Type* data, const Type* vec, @@ -502,16 +455,8 @@ void matrixVectorBinarySub(Type* data, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, - stream); + detail::matrixVectorBinarySub( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); } }; // end namespace matrix diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index a89c28ab80..d3d98cb872 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -18,14 +18,6 @@ #include "detail/matrix.cuh" -#include -#include -#include -#include -#include -#include -#include - namespace raft { namespace matrix { @@ -87,18 +79,7 @@ template void truncZeroOrigin( m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) { - auto m = out_n_rows; - auto k = in_n_rows; - idx_t size = out_n_rows * out_n_cols; - auto d_q = in; - auto d_q_trunc = out; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { - idx_t row = idx % m; - idx_t col = idx / m; - d_q_trunc[col * m + row] = d_q[col * k + row]; - }); + detail::truncZeroOrigin(in, in_n_rows, out, out_n_rows, out_n_cols, stream); } /** @@ -112,23 +93,7 @@ void truncZeroOrigin( template void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto n = n_cols; - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; - auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = dest_row; - idx_t src_col = (n - dest_col) - 1; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + detail::colReverse(inout, n_rows, n_cols, stream); } /** @@ -142,24 +107,7 @@ void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) template void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; - auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; - - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + detail::rowReverse(inout, n_rows, n_cols, stream); } /** @@ -179,14 +127,7 @@ void print(const m_t* in, char v_separator = '\n', cudaStream_t stream = rmm::cuda_stream_default) { - std::vector h_matrix = std::vector(n_cols * n_rows); - raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); - - for (idx_t i = 0; i < n_rows; i++) { - for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator); - } - } + detail::print(in, n_rows, n_cols, h_separator, v_separator, stream); } /** @@ -198,12 +139,7 @@ void print(const m_t* in, template void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) { - for (idx_t i = 0; i < n_rows; i++) { - for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f ", in[j * n_rows + i]); - } - printf("\n"); - } + detail::printHost(in, n_rows, n_cols); } /** @@ -284,10 +220,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) template m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) { - cublasHandle_t cublasH = handle.get_cublas_handle(); - m_t normval = 0; - RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); - return normval; + return detail::getL2Norm(handle, in, size, stream); } }; // end namespace matrix diff --git a/cpp/include/raft/stats/detail/mean_center.cuh b/cpp/include/raft/stats/detail/mean_center.cuh new file mode 100644 index 0000000000..1a4fc20c51 --- /dev/null +++ b/cpp/include/raft/stats/detail/mean_center.cuh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace stats { +namespace detail { + +/** + * @brief Center the input matrix wrt its mean + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output mean-centered matrix + * @param data input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether to broadcast vector along rows or columns + * @param stream cuda stream where to launch work + */ +template +void meanCenter(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); +} + +/** + * @brief Add the input matrix wrt its mean + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output mean-added matrix + * @param data input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether to broadcast vector along rows or columns + * @param stream cuda stream where to launch work + */ +template +void meanAdd(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + raft::linalg::matrixVectorOp( + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); +} + +}; // end namespace detail +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp index c0ba24312b..406a0b5047 100644 --- a/cpp/include/raft/stats/mean_center.hpp +++ b/cpp/include/raft/stats/mean_center.hpp @@ -16,9 +16,7 @@ #pragma once -#include -#include -#include +#include "detail/mean_center.cuh" namespace raft { namespace stats { @@ -47,16 +45,7 @@ void meanCenter(Type* out, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - out, - data, - mu, - D, - N, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, - stream); + detail::meanCenter(out, data, mu, D, N, rowMajor, bcastAlongRows, stream); } /** @@ -83,16 +72,7 @@ void meanAdd(Type* out, bool bcastAlongRows, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - out, - data, - mu, - D, - N, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, - stream); + detail::meanAdd(out, data, mu, D, N, rowMajor, bcastAlongRows, stream); } }; // end namespace stats