diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9768dc266c..cf4d0ed55e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,8 +317,6 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/cluster/kmeans_init_plus_plus_float.cu src/distance/distance/specializations/detail/canberra_double_double_double_int.cu src/distance/distance/specializations/detail/canberra_float_float_float_int.cu - src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu - src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu src/distance/distance/specializations/detail/correlation_double_double_double_int.cu src/distance/distance/specializations/detail/correlation_float_float_float_int.cu src/distance/distance/specializations/detail/cosine_double_double_double_int.cu @@ -352,6 +350,8 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu + src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu + src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh deleted file mode 100644 index 88add4c2b0..0000000000 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the canberra distance matrix calculation implementer - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream to launch work - */ -template -static void canberraImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::abs(x - y); - const auto add = raft::abs(x) + raft::abs(y); - // deal with potential for 0 in denominator by - // forcing 1/0 instead - acc += ((add != 0) * diff / (add + (add == 0))); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = raft::void_op(); - - if constexpr (isRowMajor) { - auto canberraRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); - - canberraRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto canberraColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); - canberraColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void canberra(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - canberraImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); -} - -/** - * @brief the canberra distance matrix calculation - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] pA input matrix - * @param[in] pB input matrix - * @param[out] pD output matrix - * @param[in] fin_op the final element-wise epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] isRowMajor whether the input and output matrices are row major - */ -template -void canberraImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type canberraOutType; - Index_ lda, ldb, ldd; - canberraOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - canberra( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - canberra( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh deleted file mode 100644 index 43b36e7921..0000000000 --- a/cpp/include/raft/distance/detail/chebyshev.cuh +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include -#include - -namespace raft { -namespace distance { -namespace detail { -/** - * @brief the Chebyshev distance matrix calculation implementer - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[out] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void chebyshevImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::abs(x - y); - acc = raft::max(acc, diff); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = raft::void_op(); - - if (isRowMajor) { - auto chebyshevRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevRowMajor); - - chebyshevRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto chebyshevColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevColMajor); - chebyshevColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void chebyshev(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the chebyshev distance matrix calculation - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] pA input matrix - * @param[in] pB input matrix - * @param[out] pD output matrix - * @param[in] fin_op the final element-wise epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] isRowMajor whether the input and output matrices are row major - */ -template -void chebyshevImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type chebyshevOutType; - Index_ lda, ldb, ldd; - chebyshevOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - chebyshev( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - chebyshev( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh deleted file mode 100644 index f7fe3678e6..0000000000 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Correlation distance matrix: - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void correlationImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const DataT* x2n, - const DataT* y2n, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [x2n, y2n, m, n, k] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh]; - - extern __shared__ char smem[]; - DataT* sx2Norm = - (DataT*)(&smem[KPolicy::SmemSize + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); - DataT* sy2Norm = (&sx2Norm[KPolicy::Mblk]); - - // Load x & y norms required by this threadblock in shmem buffer - if (gridStrideX == blockIdx.x * KPolicy::Nblk) { - for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) { - auto idx = gridStrideY + i; - sx2Norm[i] = idx < m ? x2n[idx] : 0; - } - } - - for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) { - auto idx = gridStrideX + i; - sy2Norm[i] = idx < n ? y2n[idx] : 0; - } - __syncthreads(); - -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { - regx2n[i] = sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; - } -#pragma unroll - for (int i = 0; i < KPolicy::AccColsPerTh; ++i) { - regy2n[i] = sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; - } - -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); - auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); - auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); - - acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom)); - } - } - }; - - constexpr size_t shmemSize = - KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - constexpr auto correlationRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationRowMajor); - correlationRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - constexpr auto correlationColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationColMajor); - correlationColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void correlation(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const DataT* x2n, - const DataT* y2n, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - correlationImpl( - x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - correlationImpl( - x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - correlationImpl( - x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the Correlation distance matrix calculation - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void correlationImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - AccType* workspace, - size_t& worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type correlationOutType; - Index_ lda, ldb, ldd; - correlationOutType* pDcast = reinterpret_cast(pD); - - ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) || - (worksize < 2 * m * sizeof(AccType))), - "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - AccType* norm_col_vec = workspace; - AccType* norm_row_vec = workspace; - AccType* sq_norm_col_vec = workspace; - AccType* sq_norm_row_vec = workspace; - if (pA != pB) { - norm_row_vec += m; - - raft::linalg::reduce(norm_col_vec, - pA, - k, - m, - (AccType)0, - isRowMajor, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - raft::linalg::reduce(norm_row_vec, - pB, - k, - n, - (AccType)0, - isRowMajor, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - - sq_norm_col_vec += (m + n); - sq_norm_row_vec = sq_norm_col_vec + m; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); - raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream); - } else { - raft::linalg::reduce(norm_col_vec, - pA, - k, - m, - (AccType)0, - isRowMajor, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - sq_norm_col_vec += m; - sq_norm_row_vec = sq_norm_col_vec; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); - } - - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - correlation(m, - n, - k, - lda, - ldb, - ldd, - pA, - pB, - norm_col_vec, - norm_row_vec, - sq_norm_col_vec, - sq_norm_row_vec, - pDcast, - fin_op, - stream); - } else { - lda = n, ldb = m, ldd = m; - correlation(n, - m, - k, - lda, - ldb, - ldd, - pB, - pA, - norm_row_vec, - norm_col_vec, - sq_norm_row_vec, - sq_norm_col_vec, - pDcast, - fin_op, - stream); - } -} - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh deleted file mode 100644 index 46a694aa51..0000000000 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template -struct CosineOp { - __device__ CosineOp() noexcept {} - __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept - { - return static_cast(1.0) - (AccT)(accVal / (aNorm * bNorm)); - } - __device__ AccT operator()(DataT aData) const noexcept { return aData; } -}; - -/** - * @brief the cosine distance matrix calculation implementer - * It computes the following equation: - * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread for every LDG call - * it makes. check contractions.cuh for details. - * @tparam FinalLambda the final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda -* @param stream cuda stream to launch cuda operations. - */ -template -void cosineImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ -#if (__CUDACC_VER_MAJOR__ < 12) - const auto deviceVersion = getComputeCapability(); - if (deviceVersion.first >= 8) { - using CosineOp_ = CosineOp; - CosineOp_ cosine_dist_op; - - cutlassDistanceKernel( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, cosine_dist_op, stream); - - } else -#endif - { - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j])); - } - } - }; - - constexpr size_t shmemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - auto cosineRowMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); - cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto cosineColMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); - cosineColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void cosine(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the expanded cosine distance matrix calculation - * It computes the following equation: - * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) - * @tparam IType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OType output data-type (for C and D matrices) - * @tparam OutputTile_ output tile size for the thread block - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @tparam in_params user-defined input parameter - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void cosineAlgo1(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - AccType* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - // raft distance support inputs as float/double and output as uint8_t/float/double. - static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), - "OutType can be uint8_t, float, double," - "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - typedef typename std::conditional::type CosOutType; - CosOutType* pDcast = reinterpret_cast(pD); - - ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), - "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - Index_ lda, ldb, ldd; - InType* col_vec = workspace; - InType* row_vec = workspace; - if (pA != pB) { - row_vec += m; - raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); - raft::linalg::rowNorm( - row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); - } else { - raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); - } - - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - cosine( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, fin_op, stream); - } -} - -}; // end namespace detail -}; // end namespace distance -}; // end namespace raft diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index d2d2b40283..7887eb96be 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -17,19 +17,32 @@ #pragma once #include +#include + #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + #include #include #include @@ -39,594 +52,673 @@ namespace raft { namespace distance { namespace detail { -/** enum to tell how to compute distance */ -enum DistanceType : unsigned short { - - /** evaluate as dist_ij = sum(x_ik^2) + sum(y_ij)^2 - 2*sum(x_ik * y_jk) */ - L2Expanded = 0, - /** same as above, but inside the epilogue, perform square root operation */ - L2SqrtExpanded = 1, - /** cosine distance */ - CosineExpanded = 2, - /** L1 distance */ - L1 = 3, - /** evaluate as dist_ij += (x_ik - y-jk)^2 */ - L2Unexpanded = 4, - /** same as above, but inside the epilogue, perform square root operation */ - L2SqrtUnexpanded = 5, - /** basic inner product **/ - InnerProduct = 6, - /** Chebyshev (Linf) distance **/ - Linf = 7, - /** Canberra distance **/ - Canberra = 8, - /** Generalized Minkowski distance **/ - LpUnexpanded = 9, - /** Correlation distance **/ - CorrelationExpanded = 10, - /** Jaccard distance **/ - JaccardExpanded = 11, - /** Hellinger distance **/ - HellingerExpanded = 12, - /** Haversine distance **/ - Haversine = 13, - /** Bray-Curtis distance **/ - BrayCurtis = 14, - /** Jensen-Shannon distance**/ - JensenShannon = 15, - /** Hamming distance **/ - HammingUnexpanded = 16, - /** KLDivergence **/ - KLDivergence = 17, - /** RusselRao **/ - RusselRaoExpanded = 18, - /** Dice-Sorensen distance **/ - DiceExpanded = 19, - /** Precomputed (special value) **/ - Precomputed = 100 -}; - -namespace { -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor, - InType metric_arg = 2.0f) - { - } -}; +/** + * @brief: A tag type for overload resolution based on DistanceType + * + * It is not possible to partially specialize function templates on a single + * parameter. Instead, it is often easier to use a combination of conventional + * method overloading and a parameter with a specific tag type. The following + * type is used to help method overloading based on the DistanceType enum. + */ +template +using distance_tag = std::integral_constant; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo1( - m, - n, - k, - x, - y, - dist, - false, - (AccType*)workspace, - worksize, - fin_op, - raft::resource::get_cuda_stream(handle), - isRowMajor); - } -}; +/** + * @brief Implement pairwise_matrix for specific distance + * + * There are multiple overloads for this function, one for each distance type. + * They are implemented below. The documentation of this function serves as + * documentation for all functions. The following overloads are defined: + * + * - DistanceType::Canberra: + * - DistanceType::CorrelationExpanded: + * - DistanceType::CosineExpanded: + * - DistanceType::HammingUnexpanded: + * - DistanceType::HellingerExpanded: + * - DistanceType::JensenShannon: + * - DistanceType::KLDivergence: + * - DistanceType::L1: + * - DistanceType::L2Expanded: + * - DistanceType::L2SqrtExpanded: + * - DistanceType::L2Unexpanded: + * - DistanceType::L2SqrtUnexpanded: + * - DistanceType::Linf: + * - DistanceType::LpUnexpanded: + * - DistanceType::RusselRaoExpanded: + * + * @tparam DataT Input data type + * @tparam AccT Accumulation data type + * @tparam OutT Output data type + * @tparam FinOpT Type of final operation + * @tparam IdxT Index type + * + * @param handle RAFT resources handle + * @param distance_type A tag type to indicate which distance is calculated. + * @param x First set of points + * @param y Second set of points + * @param out Output distance matrix + * @param m Number of points in x + * @param n Number of points in y + * @param k Dimensionality of points in x, y + * @param workspace Temporary workspace needed for computations + * @param worksize Number of bytes of the workspace + * @param is_row_major Whether the matrices are row-major or col-major + * @param metric_arg The `p` argument for Lp. + */ +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, // unused + size_t worksize, // unused + FinOpT fin_op, + bool is_row_major, + DataT metric_arg) // unused +{ + ops::canberra_distance_op distance_op{}; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo1( - m, - n, - k, - x, - y, - dist, - true, - (AccType*)workspace, - worksize, - fin_op, - raft::resource::get_cuda_stream(handle), - isRowMajor); - } -}; + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::cosineAlgo1( - m, - n, - k, - x, - y, - dist, - (AccType*)workspace, - worksize, - fin_op, - raft::resource::get_cuda_stream(handle), - isRowMajor); - } -}; + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda, - bool isRowMajor, - InType) - { - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - raft::linalg::gemm(handle, - dist, - const_cast(x), - const_cast(y), - m, - n, - k, - !isRowMajor, - !isRowMajor, - isRowMajor, - stream); +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + bool is_row_major, + DataT) // unused +{ + ASSERT( + !(((x != y) && (worksize < 2 * (m + n) * sizeof(AccT))) || (worksize < 2 * m * sizeof(AccT))), + "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + AccT* norm_col_vec = workspace; + AccT* norm_row_vec = workspace; + AccT* sq_norm_col_vec = workspace; + AccT* sq_norm_row_vec = workspace; + if (x != y) { + norm_row_vec += m; + + raft::linalg::reduce(norm_col_vec, + x, + k, + m, + (AccT)0, + is_row_major, + true, + stream, + false, + raft::identity_op(), + raft::add_op()); + raft::linalg::reduce(norm_row_vec, + y, + k, + n, + (AccT)0, + is_row_major, + true, + stream, + false, + raft::identity_op(), + raft::add_op()); + + sq_norm_col_vec += (m + n); + sq_norm_row_vec = sq_norm_col_vec + m; + raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream); + raft::linalg::rowNorm(sq_norm_row_vec, y, k, n, raft::linalg::L2Norm, is_row_major, stream); + } else { + raft::linalg::reduce(norm_col_vec, + x, + k, + m, + (AccT)0, + is_row_major, + true, + stream, + false, + raft::identity_op(), + raft::add_op()); + sq_norm_col_vec += m; + sq_norm_row_vec = sq_norm_col_vec; + raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream); } -}; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo2( - m, n, k, x, y, dist, false, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); - } -}; + using OpT = ops::correlation_distance_op; + OpT corr_op(is_row_major, sq_norm_col_vec, sq_norm_row_vec, m, n, k); + distance_matrix_dispatch( + corr_op, m, n, k, x, y, norm_col_vec, norm_row_vec, out, fin_op, stream, is_row_major); +} -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo2( - m, n, k, x, y, dist, true, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + bool is_row_major, + DataT) // unused +{ + // raft distance support inputs as float/double and output as uint8_t/float/double. + static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); + + ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), + "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + DataT* norm_A = workspace; + DataT* norm_B = workspace; + if (x != y) { + norm_B += m; + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); + raft::linalg::rowNorm( + norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); + } else { + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); } -}; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::l1Impl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); + // On CUDA 12: + // - always execute normal kernel + // + // On CUDA 11 and below: + // - execute CUTLASS-based kernel on SM_80 and above + // - execute normal kernel otherwise. + + if constexpr (__CUDACC_VER_MAJOR__ == 12) { + // Always execute legacy kernels on CUDA 12 + ops::cosine_distance_op distance_op{}; + distance_matrix_dispatch( + distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + // If device is SM_80 or later, use CUTLASS-based kernel. + using Op = ops::cosine_cutlass_op; + Op distance_op{}; + + distance_matrix_cutlass_dispatch( + distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + // Else use "legacy" cosine kernel + ops::cosine_distance_op distance_op{}; + distance_matrix_dispatch( + distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } } -}; +} -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::chebyshevImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); - } -}; +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::hamming_distance_op distance_op{k}; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::hellingerImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); - } -}; + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType metric_arg) - { - raft::distance::detail::minkowskiImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor, metric_arg); - } -}; + cudaStream_t stream = raft::resource::get_cuda_stream(handle); -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::canberraImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); - } -}; + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::hammingUnexpandedImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); - } -}; +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + raft::linalg::gemm(handle, + out, + const_cast(x), + const_cast(y), + m, + n, + k, + !is_row_major, + !is_row_major, + is_row_major, + stream); +} -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::jensenShannonImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + // First sqrt x and y + const auto raft_sqrt = raft::linalg::unaryOp; + + raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); + if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } + + // Then calculate Hellinger distance + ops::hellinger_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); + + // Finally revert sqrt of x and y + raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); + if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } + + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::jensen_shannon_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + auto unaryOp_lambda = [] __device__(DataT input) { + const bool x_zero = (input == 0); + return (!x_zero) * raft::log(input + x_zero); + }; + + auto unaryOp_lambda_reverse = [] __device__(DataT input) { + // reverse previous log (x) back to x using (e ^ log(x)) + const bool x_zero = (input == 0); + return (!x_zero) * raft::exp(input); + }; + + // This op takes some shortcuts when x equals y. So its behavior changes based + // on this. + ops::kl_divergence_op kl_divergence{is_row_major, x == y}; + + if (x != y) { + raft::linalg::unaryOp( + (DataT*)y, y, n * k, unaryOp_lambda, stream); } -}; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::russellRaoImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + kl_divergence, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); + + if (x != y) { + // Now reverse previous log (x) back to x using (e ^ log(x)) + raft::linalg::unaryOp( + (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); } -}; +} -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::klDivergenceImpl( - m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor); +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::l1_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl_l2_expanded( // NOTE: different name + bool perform_sqrt, // dispatch on sqrt + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // raft distance support inputs as float/double and output as uint8_t/float/double. + static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); + + ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), + "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + DataT* norm_A = workspace; + DataT* norm_B = workspace; + if (x != y) { + norm_B += m; + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); + raft::linalg::rowNorm( + norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); + } else { + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); } -}; -template -struct DistanceImpl { - void run(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor, - InType) - { - raft::distance::detail::correlationImpl( - m, - n, - k, - x, - y, - dist, - (AccType*)workspace, - worksize, - fin_op, - raft::resource::get_cuda_stream(handle), - isRowMajor); + // On CUDA 12: + // - always execute normal kernel + // + // On CUDA 11 and below: + // - execute CUTLASS-based kernel on SM_80 and above + // - execute normal kernel otherwise. + + if constexpr (__CUDACC_VER_MAJOR__ == 12) { + // Always execute legacy kernels on CUDA 12 + ops::l2_exp_distance_op l2_op(perform_sqrt); + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + // If device is SM_80 or later, use CUTLASS-based kernel. + using L2Op = ops::l2_exp_cutlass_op; + L2Op l2_op(perform_sqrt); + + distance_matrix_cutlass_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + // Else use "legacy" L2 + ops::l2_exp_distance_op l2_op(perform_sqrt); + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } } -}; +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + bool perform_sqrt = false; + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + distance_impl_l2_expanded( + perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + bool perform_sqrt = true; + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + distance_impl_l2_expanded( + perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + bool perform_sqrt = false; + ops::l2_unexp_distance_op l2_op(perform_sqrt); + + // The unexpanded L2 does not require the norms of a and b to be calculated. + const DataT* norm_A = nullptr; + const DataT* norm_B = nullptr; + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + bool perform_sqrt = true; + ops::l2_unexp_distance_op l2_op(perform_sqrt); + + // The unexpanded L2 does not require the norms of a and b to be calculated. + const DataT* norm_A = nullptr; + const DataT* norm_B = nullptr; + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); +} -} // anonymous namespace +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::l_inf_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT metric_arg) +{ + ops::lp_unexp_distance_op distance_op{metric_arg}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::russel_rao_distance_op distance_op{k}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} /** * @brief Evaluate pairwise distances with the user epilogue lamba allowed @@ -636,15 +728,17 @@ struct DistanceImpl distImpl; - distImpl.run(handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg); + // raft distance support inputs as float/double and output as uint8_t/float/double. + static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), + "OutType can be uint8_t, float, double," + "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); + + distance_impl( + handle, + distance_tag{}, + x, + y, + out, + m, + n, + k, + reinterpret_cast(workspace), + worksize, + fin_op, + isRowMajor, + metric_arg); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -691,24 +802,9 @@ void distance(raft::resources const& handle, * @param k dimensionality * @param workspace temporary workspace needed for computations * @param worksize number of bytes of the workspace + * @param stream cuda stream * @param isRowMajor whether the matrices are row-major or col-major - * - * @note if workspace is passed as nullptr, this will return in - * worksize, the number of bytes of workspace required */ - -// Default final op functor which facilitates elementwise operation on -// final distance value if any. -template -struct default_fin_op { - __host__ __device__ default_fin_op() noexcept {}; - // functor signature. - __host__ __device__ OutType operator()(AccType d_val, Index g_d_idx) const noexcept - { - return d_val; - } -}; - template ; - final_op_type fin_op; + auto fin_op = raft::identity_op(); - // raft distance support inputs as float/double and output as uint8_t/float/double. - static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), - "OutType can be uint8_t, float, double," - "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - distance( - handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + distance( + handle, x, y, out, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg); } /** @@ -775,42 +865,6 @@ size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, In return worksize; } -/** - * @defgroup pairwise_distance pairwise distance prims - * @{ - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace buffer which can get resized as per the - * needed workspace size - * @param metric distance metric - * @param isRowMajor whether the matrices are row-major or col-major - */ -template -void pairwise_distance_impl(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - rmm::device_uvector& workspace, - bool isRowMajor, - Type metric_arg = 2.0f) -{ - auto worksize = getWorkspaceSize(x, y, m, n, k); - workspace.resize(worksize, raft::resource::get_cuda_stream(handle)); - distance( - handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); -} -/** @} */ }; // namespace detail }; // namespace distance }; // namespace raft diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh new file mode 100644 index 0000000000..45bea08a95 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** + * @brief The canberra distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sum_k |x_ik - y_kj| / ( |x_ik| + |y_kj| ) + */ +template +struct canberra_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = raft::abs(x - y); + const auto add = raft::abs(x) + raft::abs(y); + // deal with potential for 0 in denominator by + // forcing 0/1 instead + acc += ((add != 0) * diff / (add + (add == 0))); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + return; + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh new file mode 100644 index 0000000000..3832104280 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** @brief The correlation distance + * + * It computes the following equation: + * + * d(x, y) = ((x - mean(x)) â‹… (y - mean(y))) + * / + * (|| x - mean(x) ||_2 || y - mean(y) ||_2) + */ +template +struct correlation_distance_op { + const DataT* x2n; + const DataT* y2n; + IdxT m; + IdxT n; + IdxT k; + + correlation_distance_op( + bool is_row_major, const DataT* x2n_, const DataT* y2n_, IdxT m_, IdxT n_, IdxT k_) noexcept + : x2n(x2n_), y2n(y2n_), m(m_), n(n_), k(k_) + { + // The distance op is typically created before the row-major/col-major + // swapping has been done. So we do it here. + if (!is_row_major) { + std::swap(x2n, y2n); + std::swap(m, n); + } + } + + // Load norms of input data + static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + (2 * (Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + // Note how we can sneakily get a pointer to shared memory here, to store + // more data. If the implementation of PairwiseDistanceMatKernel ever + // changes, this will be where we find the bugs. + extern __shared__ char smem[]; + + DataT regx2n[Policy::AccRowsPerTh], regy2n[Policy::AccColsPerTh]; + + DataT* sx2Norm = + (DataT*)(&smem[Policy::SmemSize + (Policy::Mblk + Policy::Nblk) * sizeof(DataT)]); + DataT* sy2Norm = (&sx2Norm[Policy::Mblk]); + + // Load x & y norms required by this threadblock in shmem buffer + if (gridStrideX == blockIdx.x * Policy::Nblk) { + for (int i = threadIdx.x; i < Policy::Mblk; i += Policy::Nthreads) { + auto idx = gridStrideY + i; + sx2Norm[i] = idx < m ? x2n[idx] : 0; + } + } + + for (int i = threadIdx.x; i < Policy::Nblk; i += Policy::Nthreads) { + auto idx = gridStrideX + i; + sy2Norm[i] = idx < n ? y2n[idx] : 0; + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + regx2n[i] = sx2Norm[i * Policy::AccThRows + (threadIdx.x / Policy::AccThCols)]; + } +#pragma unroll + for (int i = 0; i < Policy::AccColsPerTh; ++i) { + regy2n[i] = sy2Norm[i * Policy::AccThCols + (threadIdx.x % Policy::AccThCols)]; + } + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); + auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); + auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); + + acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom)); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh new file mode 100644 index 0000000000..c3f3b75e62 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the expanded cosine distance matrix calculation + * + * It computes the following equation: + * + * d(x, y) = 1 - (x â‹… y) / ( ||x||_2 ||y||_2) + */ +template +struct cosine_distance_op { + // Load norms of input data + static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j])); + } + } + } +}; + +template +struct cosine_cutlass_op { + __device__ cosine_cutlass_op() noexcept {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + return static_cast(1.0) - (AccT)(accVal / (aNorm * bNorm)); + } + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh new file mode 100644 index 0000000000..98acf11560 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the Hamming Unexpanded distance matrix calculation + * It computes the following equation: + * + * c_ij = sum_k (x_ik != y_kj) / k + */ +template +struct hamming_distance_op { + IdxT k; + + hamming_distance_op(IdxT k_) noexcept : k(k_) {} + + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += (x != y); }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + const DataT one_over_k = DataT(1.0) / k; +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] *= one_over_k; + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh new file mode 100644 index 0000000000..c5e2b84ac2 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the Hellinger distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sqrt(1 - sum_k sqrt(x_ik * y_kj)) + * + */ +template +struct hellinger_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + // This is sqrt(x) * sqrt(y). + const auto product = x * y; + acc += product; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + const auto finalVal = (1 - acc[i][j]); + const auto rectifier = (!signbit(finalVal)); + acc[i][j] = raft::sqrt(rectifier * finalVal); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh new file mode 100644 index 0000000000..df5aadcf3b --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +// Describes the computation the jensen_shannon distance + +/** + * @brief the Jensen Shannon distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + * + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) + */ +template +struct jensen_shannon_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const DataT m = 0.5f * (x + y); + const bool m_zero = (m == 0); + const auto logM = (!m_zero) * raft::log(m + m_zero); + + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::sqrt(0.5 * acc[i][j]); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh new file mode 100644 index 0000000000..526927243f --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the KL Divergence distance matrix calculation + * + * It computes the following equation: + * + * c_ij = 0.5 * sum(x * log (x / y)); + */ +template +struct kl_divergence_op { + const bool is_row_major; + const bool x_equal_y; + + kl_divergence_op(bool row_major_, bool x_equal_y_ = false) noexcept + : is_row_major(row_major_), x_equal_y(x_equal_y_) + { + } + + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + // TODO: make sure that these branches get hoisted out of main loop.. Could + // be quite expensive otherwise. + if (x_equal_y) { + if (is_row_major) { + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero)); + } else { + const bool y_zero = (y == 0); + const bool x_zero = (x == 0); + acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero)); + } + } else { + if (is_row_major) { + const bool x_zero = (x == 0); + acc += x * (raft::log(x + x_zero) - y); + } else { + const bool y_zero = (y == 0); + acc += y * (raft::log(y + y_zero) - x); + } + } + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = (0.5f * acc[i][j]); + } + } + } +}; +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh new file mode 100644 index 0000000000..b02971bac7 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the L1 distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sum_k abs(x_ik - y_kj) + */ +template +struct l1_distance_op { + // Do not load norms of data, the computation of L1 distance does not use them. + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += raft::abs(x - y); }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + return; + }; +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh new file mode 100644 index 0000000000..fb00f8d66a --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the expanded euclidean distance matrix calculation + * + * It computes the following equation: + * + * c_ij = - 2 sum_k x_ik * y_kj + ||x_i.||_2 + ||y_.j||_2 + * + */ +template +struct l2_exp_distance_op { + bool sqrt; + + l2_exp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {} + + // Load norms of input data + static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + DataT val = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; + acc[i][j] = val * (val > DataT(0.0)); + } + } + if (sqrt) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::sqrt(acc[i][j]); + } + } + } + } +}; + +// Epilogue operator for CUTLASS based kernel +template +struct l2_exp_cutlass_op { + bool sqrt; + + __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {} + __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; + // outVal could be negative due to numerical instability, especially when + // calculating self distance. + // clamp to 0 to avoid potential NaN in sqrt + outVal = outVal * (outVal > DataT(0.0)); + return sqrt ? raft::sqrt(outVal) : outVal; + } + + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh new file mode 100644 index 0000000000..e03eb0a97e --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the unexpanded euclidean distance matrix calculation + * + * It computes the following equation: + * + * c_ij = optional_sqrt ( sum_k (x_ik - y_kj)^2 ) + */ +template +struct l2_unexp_distance_op { + bool sqrt; + + l2_unexp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {} + + // Do not load norms of data, the computation of L1 distance does not use them. + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = x - y; + acc += diff * diff; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + if (sqrt) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::sqrt(acc[i][j]); + } + } + } + }; +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh new file mode 100644 index 0000000000..caa1379133 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the L_inf (Chebyshev) distance matrix calculation + * + * It computes the following equation: + * + * c_ij = max_k | x_ik - y_kj | + */ +template +struct l_inf_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = raft::abs(x - y); + acc = raft::max(acc, diff); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + return; + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh new file mode 100644 index 0000000000..a4a090d058 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the unexpanded Lp (Minkowski) distance matrix calculation + * + * It computes the following equation: + * + * c_ij = (sum_k |x_ik - y_jk|^p)^(1/p) + */ +template +struct lp_unexp_distance_op { + DataT p; + + lp_unexp_distance_op(DataT p_) noexcept : p(p_) {} + + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = raft::abs(x - y); + acc += raft::pow(diff, p); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + const auto one_over_p = 1.0f / p; +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::pow(acc[i][j], one_over_p); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh new file mode 100644 index 0000000000..7acd858e49 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +/** + * @brief the Russell Rao distance matrix calculation + * + * It computes the following equation: + * + * c_ij = (k - (sum_k x_ik * y_kj)) / k + */ +template +struct russel_rao_distance_op { + IdxT k; + const float one_over_k; + + russel_rao_distance_op(IdxT k_) noexcept : k(k_), one_over_k(1.0f / k_) {} + + // Load norms of input data + static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = (k - acc[i][j]) * one_over_k; + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh new file mode 100644 index 0000000000..b0f40123aa --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +// Describes the computation the template distance +// +// Fill in the TODO items. + +template +struct template_distance_op { + TODO member; + + template_distance_op(TODO member_) noexcept : member(member_) {} + + // Load norms of input data + static constexpr bool use_norms = TODO; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + TODO; + } + + DI void core(AccT& acc, DataT& x, DataT& y) const { TODO; }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + TODO; + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh deleted file mode 100644 index 7c5fdb912c..0000000000 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template -struct L2ExpandedOp { - bool sqrt; - - __device__ L2ExpandedOp() noexcept : sqrt(false) {} - __device__ L2ExpandedOp(bool isSqrt) noexcept : sqrt(isSqrt) {} - __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept - { - AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; - // outVal could be negative due to numerical instability, especially when - // calculating self distance. - // clamp to 0 to avoid potential NaN in sqrt - outVal = outVal * (outVal > DataT(0.0)); - return sqrt ? raft::sqrt(outVal) : outVal; - } - - __device__ AccT operator()(DataT aData) const noexcept { return aData; } -}; - -/** - * @brief the expanded euclidean distance matrix calculation implementer - * It computes the following equation: C = op(A^2 + B^2 - 2AB) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread for every LDG call - * it makes. check contractions.cuh for details. - * @tparam FinalLambda the final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[in] sqrt if the square root is computed or not - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda -* @param stream cuda stream to launch cuda operations. - */ -template -void euclideanExpImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ -#if (__CUDACC_VER_MAJOR__ < 12) - const auto deviceVersion = getComputeCapability(); - if (deviceVersion.first >= 8) { - using L2Op = L2ExpandedOp; - L2Op L2_dist_op(sqrt); - - cutlassDistanceKernel( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream); - - } else -#endif - { - - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - DataT val = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; - acc[i][j] = val * (val > DataT(0.0)); - } - } - if (sqrt) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(acc[i][j]); - } - } - } - }; - - constexpr size_t shmemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - auto euclideanExpRowMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); - - euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto euclideanExpColMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); - euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void euclideanExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } -} - -/** - * @brief the expanded euclidean distance matrix calculation - * It computes the following equation: C = op(A^2 + B^2 - 2AB) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda the final lambda called by FragmentMultiplyAdd_ - * @tparam Index_ index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param enable_sqrt if the square root is computed or not - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void euclideanAlgo1(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - bool enable_sqrt, - AccType* workspace, - size_t& worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - // raft distance support inputs as float/double and output as uint8_t/float/double. - static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), - "OutType can be uint8_t, float, double," - "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - typedef typename std::conditional::type ExpOutType; - ExpOutType* pDcast = reinterpret_cast(pD); - - ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), - "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - Index_ lda, ldb, ldd; - InType* col_vec = workspace; - InType* row_vec = workspace; - if (pA != pB) { - row_vec += m; - raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); - raft::linalg::rowNorm( - row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); - } else { - raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); - } - - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - euclideanExp( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - euclideanExp( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream); - } -} - -/** - * @brief the unexpanded euclidean distance matrix calculation - * It computes the following equation: cij = op((ai-bj)^2) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam FinalLambda final lambda called on final distance value - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[in] sqrt if the square root is computed or not - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda - */ -template -void euclideanUnExpImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = x - y; - acc += diff * diff; - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - if (sqrt) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(acc[i][j]); - } - } - } - }; - - if (isRowMajor) { - auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor); - - euclideanUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - - } else { - auto euclideanUnExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpColMajor); - - euclideanUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void euclideanUnExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } -} - -/** - * @brief the unexpanded euclidean distance matrix calculation - * It computes the following equation: cij = op((ai-bj)^2) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param enable_sqrt if the square root is computed or not - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void euclideanAlgo2(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - bool enable_sqrt, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type UnExpOutType; - UnExpOutType* pDcast = reinterpret_cast(pD); - Index_ lda, ldb, ldd; - - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - euclideanUnExp( - m, n, k, lda, ldb, ldd, pA, pB, enable_sqrt, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - euclideanUnExp( - n, m, k, lda, ldb, ldd, pB, pA, enable_sqrt, pDcast, fin_op, stream); - } -} - -}; // end namespace detail -}; // end namespace distance -}; // end namespace raft diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh deleted file mode 100644 index bed9d09e3e..0000000000 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Hamming distance matrix using the unexpanded form: - * It computes the following equation: - Cij = sum(x_i != y_i) / k - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void hammingUnexpandedImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += (x != y); }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [k] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - const DataT one_over_k = DataT(1.0) / k; -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] *= one_over_k; - } - } - }; - - if (isRowMajor) { - auto hammingUnexpandedRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedRowMajor); - - hammingUnexpandedRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto hammingUnexpandedColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedColMajor); - hammingUnexpandedColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void hammingUnexpanded(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hammingUnexpandedImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hammingUnexpandedImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - hammingUnexpandedImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the Hamming Unexpanded distance matrix calculation - * It computes the following equation: - Cij = sum(x_i != y_i) / k - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void hammingUnexpandedImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef - typename std::conditional::type hammingUnexpandedOutType; - Index_ lda, ldb, ldd; - hammingUnexpandedOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - hammingUnexpanded( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - - } else { - lda = n, ldb = m, ldd = m; - hammingUnexpanded( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh deleted file mode 100644 index 13507fe84f..0000000000 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Hellinger distance matrix using the expanded form: - * It computes the following equation: - cij = sqrt(1 - sum(sqrt(x_k * y_k))) - * This distance computation modifies A and B by computing a sqrt - * and then performing a `pow(x, 2)` to convert it back. Because of this, - * it is possible that the values in A and B might differ slightly - * after this is invoked. - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void hellingerImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // First sqrt x and y - raft::linalg::unaryOp((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft::linalg::unaryOp((DataT*)y, y, n * k, raft::sqrt_op{}, stream); - } - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - // This is sqrt(x) * sqrt(y). - const auto product = x * y; - acc += product; - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); - const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::sqrt(rectifier * finalVal); - } - } - }; - - if (isRowMajor) { - auto hellingerRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerRowMajor); - - hellingerRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto hellingerColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerColMajor); - hellingerColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - // Revert sqrt of x and y - raft::linalg::unaryOp((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft::linalg::unaryOp((DataT*)y, y, n * k, raft::sqrt_op{}, stream); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void hellinger(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the Hellinger distance matrix calculation - * It computes the following equation: - sqrt(1 - sum(sqrt(x_k * y_k)) - * This distance computation modifies A and B by computing a sqrt - * and then performing a `pow(x, 2)` to convert it back. Because of this, - * it is possible that the values in A and B might differ slightly - * after this is invoked. - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void hellingerImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type hellingerOutType; - Index_ lda, ldb, ldd; - hellingerOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - hellinger( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - - } else { - lda = n, ldb = m, ldd = m; - hellinger( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh deleted file mode 100644 index f96da01b87..0000000000 --- a/cpp/include/raft/distance/detail/jensen_shannon.cuh +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Jensen Shannon distance matrix: - * It computes the following equation: - Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) - + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void jensenShannonImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const DataT m = 0.5f * (x + y); - const bool m_zero = (m == 0); - const auto logM = (!m_zero) * raft::log(m + m_zero); - - const bool x_zero = (x == 0); - const bool y_zero = (y == 0); - acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(0.5 * acc[i][j]); - } - } - }; - - if (isRowMajor) { - auto jensenShannonRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, jensenShannonRowMajor); - - jensenShannonRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto jensenShannonColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, jensenShannonColMajor); - jensenShannonColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void jensenShannon(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - jensenShannonImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - jensenShannonImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - jensenShannonImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the Jensen Shannon distance matrix calculation - * It computes the following equation: - Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) - + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void jensenShannonImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type jensenShannonOutType; - Index_ lda, ldb, ldd; - jensenShannonOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - jensenShannon( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - - } else { - lda = n, ldb = m, ldd = m; - jensenShannon( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh deleted file mode 100644 index 7ebeaf4de9..0000000000 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ /dev/null @@ -1,343 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the KL Divergence distance matrix: - * It computes the following equation: - Cij = 0.5 * sum(x * log (x / y)); - * This distance computation modifies A or B by computing a log(x) - * and then performing a `pow(e, log(x))` to convert it back. Because of this, - * it is possible that the values in A or B might differ slightly - * after this is invoked. - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void klDivergenceImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - if (isRowMajor) { - const bool x_zero = (x == 0); - acc += x * (raft::log(x + x_zero) - y); - } else { - const bool y_zero = (y == 0); - acc += y * (raft::log(y + y_zero) - x); - } - }; - - auto core_lambda_x_equal_y = [] __device__(AccT & acc, DataT & x, DataT & y) { - if (isRowMajor) { - const bool x_zero = (x == 0); - const bool y_zero = (y == 0); - acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero)); - } else { - const bool y_zero = (y == 0); - const bool x_zero = (x == 0); - acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero)); - } - }; - - auto unaryOp_lambda = [] __device__(DataT input) { - const bool x_zero = (input == 0); - return (!x_zero) * raft::log(input + x_zero); - }; - - auto unaryOp_lambda_reverse = [] __device__(DataT input) { - // reverse previous log (x) back to x using (e ^ log(x)) - const bool x_zero = (input == 0); - return (!x_zero) * raft::exp(input); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = (0.5f * acc[i][j]); - } - } - }; - - if (isRowMajor) { - constexpr auto klDivergenceRowMajor = pairwiseDistanceMatKernel; - constexpr auto klDivergenceRowMajorXequalY = - pairwiseDistanceMatKernel; - if (x != y) { - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajor); - klDivergenceRowMajor<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda, - epilog_lambda, - fin_op); - // Now reverse previous log (x) back to x using (e ^ log(x)) - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); - } else { - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajorXequalY); - klDivergenceRowMajorXequalY<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda_x_equal_y, - epilog_lambda, - fin_op); - } - } else { - constexpr auto klDivergenceColMajor = pairwiseDistanceMatKernel; - constexpr auto klDivergenceColMajorXequalY = - pairwiseDistanceMatKernel; - if (x != y) { - raft::linalg::unaryOp( - (DataT*)x, x, m * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceColMajor); - klDivergenceColMajor<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda, - epilog_lambda, - fin_op); - // Now reverse previous log (x) back to x using (e ^ log(x)) - raft::linalg::unaryOp( - (DataT*)x, x, m * k, unaryOp_lambda_reverse, stream); - } else { - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceColMajorXequalY); - klDivergenceColMajorXequalY<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda_x_equal_y, - epilog_lambda, - fin_op); - } - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void klDivergence(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - klDivergenceImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - klDivergenceImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - klDivergenceImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the KL Divergence distance matrix calculation - * It computes the following equation: - Cij = 0.5 * sum(x * log (x / y)); - * This distance computation modifies A or B by computing a log(x) - * and then performing a `pow(e, log(x))` to convert it back. Because of this, - * it is possible that the values in A or B might differ slightly - * after this is invoked. - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void klDivergenceImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type klDivergenceOutType; - Index_ lda, ldb, ldd; - klDivergenceOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - klDivergence( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - - } else { - lda = n, ldb = m, ldd = m; - klDivergence( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh deleted file mode 100644 index bf10651b60..0000000000 --- a/cpp/include/raft/distance/detail/l1.cuh +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the L1 distance matrix calculation implementer - * It computes the following equation: cij = op(ai-bj) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda - */ -template -static void l1Impl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::abs(x - y); - acc += diff; - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = raft::void_op(); - - if (isRowMajor) { - auto l1RowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); - - l1RowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto l1ColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); - l1ColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void l1(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - l1Impl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - l1Impl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - l1Impl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the L1 distance matrix calculation - * It computes the following equation: cij = op(ai-bj) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void l1Impl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type L1OutType; - Index_ lda, ldb, ldd; - L1OutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - l1( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - - } else { - lda = n, ldb = m, ldd = m; - l1( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh deleted file mode 100644 index 5594cbf5f1..0000000000 --- a/cpp/include/raft/distance/detail/minkowski.cuh +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the unexpanded Minkowski distance matrix calculation - * It computes the following equation: cij = sum(|x - y|^p)^(1/p) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] the value of `p` for Minkowski (l-p) distances. - */ -template -void minkowskiUnExpImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream, - DataT p) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [p] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::abs(x - y); - acc += raft::pow(diff, p); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - const auto one_over_p = 1.0f / p; -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::pow(acc[i][j], one_over_p); - } - } - }; - - if constexpr (isRowMajor) { - auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor); - - minkowskiUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - - } else { - auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor); - - minkowskiUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void minkowskiUnExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream, - DataT metric_arg) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - minkowskiUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); - } else { - minkowskiUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); - } -} - -/** - * @brief the unexpanded minkowski distance matrix calculation - * It computes the following equation: cij = sum(|x - y|^p)^(1/p) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ index type - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] pA input matrix - * @param[in] pB input matrix - * @param[out] pD output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] isRowMajor whether the input and output matrices are row major - * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. - */ -template -void minkowskiImpl(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type LpUnexpOutType; - LpUnexpOutType* pDcast = reinterpret_cast(pD); - Index_ lda, ldb, ldd; - - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - minkowskiUnExp( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream, metric_arg); - } else { - lda = n, ldb = m, ldd = m; - minkowskiUnExp( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream, metric_arg); - } -} -}; // end namespace detail -}; // end namespace distance -}; // end namespace raft diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 293600ed21..0293f10c29 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -271,170 +271,6 @@ struct PairwiseDistances : public BaseClass { } }; // struct PairwiseDistances -/** - * @brief the distance matrix calculation kernel for L1, L2 and cosine - * @tparam useNorms whether norms are needed - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda lambda which implements accumulation operation - * @tparam EpilogueLambda lambda which implements operation for calculating - final value. - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major(default), - false for column major - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param core_op the core lambda - * @param epilog_op the epilogue lambda - * @param fin_op the final gemm epilogue lambda - */ - -template -__global__ __launch_bounds__(Policy::Nthreads, 2) - - void pairwiseDistanceMatKernel(const DataT* x, - const DataT* y, - const DataT* _xn, - const DataT* _yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - CoreLambda core_op, - EpilogueLambda epilog_op, - FinalLambda fin_op) -{ - extern __shared__ char smem[]; - auto rowEpilog = [] __device__(IdxT starty) { return; }; - - PairwiseDistances - obj( - x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); - obj.run(); -} - -/** - * @brief the distance matrix calculation kernel for L2 and cosine - * for GPU arch < SM 8.0, this version is to make sure we don't recompile - * these kernels for ampere or higher as we use cutlass kernel for it. - * @tparam useNorms whether norms are needed - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda lambda which implements accumulation operation - * @tparam EpilogueLambda lambda which implements operation for calculating - final value. - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major(default), - false for column major - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param core_op the core lambda - * @param epilog_op the epilogue lambda - * @param fin_op the final gemm epilogue lambda - */ - -template -__global__ __launch_bounds__(Policy::Nthreads, 2) - - void pairwiseDistanceMatKernelPriorToAmpere(const DataT* x, - const DataT* y, - const DataT* _xn, - const DataT* _yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - CoreLambda core_op, - EpilogueLambda epilog_op, - FinalLambda fin_op) -{ - //#if __CUDA_ARCH__ < 800 - // TODO: re-enable the CUDA_ARCH guard for below Ampere once cutlass based - // kernels are enabled for CUDA 12.0 - extern __shared__ char smem[]; - auto rowEpilog = [] __device__(IdxT starty) { return; }; - - PairwiseDistances - obj( - x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); - obj.run(); - //#endif -} - template dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { @@ -462,4 +298,4 @@ dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) }; // namespace detail }; // namespace distance -}; // namespace raft \ No newline at end of file +}; // namespace raft diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh index f39d880da4..2ab5c69b0d 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,6 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" -#if (__CUDACC_VER_MAJOR__ < 12) - // We define CUTLASS_NAMESPACE in case // RAFT cmake is not used #ifndef CUTLASS_NAMESPACE @@ -171,8 +169,8 @@ void cutlassDistanceKernel(const DataT* x, CUTLASS_CHECK(status); } -}; // namespace detail -}; // namespace distance -}; // namespace raft -#endif // (__CUDACC_VER_MAJOR__ < 12) +}; // namespace detail +}; // namespace distance +}; // namespace raft + #pragma GCC diagnostic pop diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh new file mode 100644 index 0000000000..9def354600 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "kernel_sm60.cuh" +#include +#include +#include +#include +#include + +namespace raft::distance::detail { + +/** + * @brief: Computes minimal common alignment of the rows in a 2D array in bytes + * + * The 2D matrix `x` is assumed to be row-major. This function computes the + * minimal alignment in bytes of the first elements of each row. + * Output can be 16, 8, 4, 2, 1. + * + * @param x Base pointer of row-major input matrix + * @param stride Stride in number of element between consecutive rows. + */ +template +size_t alignment_of_2d_array(const DataT* x, size_t stride) +{ + auto base = reinterpret_cast(x); + size_t stride_bytes = sizeof(DataT) * stride; + + for (int align = 16; align >= 0; align /= 2) { + bool base_aligned = base % align == 0; + bool stride_aligned = stride_bytes % align == 0; + if (base_aligned && stride_aligned) { return align; } + } + return 1; +} + +template +using vec_len_constant = std::integral_constant; + +/** + * @brief: Converts run-time arguments to compile-time arguments + * + * Converts run-time arguments row_major and vec_len to compile-time arguments + * and dispatches a lambda f with these compile-time arguments. + * + * This is equivalent to copying and pasting the lambda function `f` in each of + * the switch case statements. + * + * @tparam F Type of lambda f. + * @param row_major Boolean indicating whether input arrays have row-major layout. + * @param vec_len Integer value 1, 2, or 4 specifying the Veclen template parameter of + * the KernelPolicy. + * @param f Lambda that takes two std::integral_constant parameters representing + * row_major and vec_len. + */ +template +void dispatch(bool row_major, int vec_len, F&& f) +{ + if (row_major) { + switch (vec_len) { + case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + default: f(std::bool_constant(), vec_len_constant<1>()); break; + } + } else { + switch (vec_len) { + case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + default: f(std::bool_constant(), vec_len_constant<1>()); break; + } + } +} + +template +void distance_matrix_dispatch(OpT distance_op, + IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + const DataT* x_norm, + const DataT* y_norm, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // Determine leading dimensions and, if column-major, flip order of passing x + // and y. + IdxT ldx, ldy, ld_out; + if (is_row_major) { + ldx = k, ldy = k, ld_out = n; + } else { + // Flip x, y, and m, n. + std::swap(x, y); + std::swap(x_norm, y_norm); + std::swap(m, n); + ldx = m, ldy = n, ld_out = n; + } + + size_t align_x = alignment_of_2d_array(x, ldx); + size_t align_y = alignment_of_2d_array(y, ldy); + size_t byte_alignment = min(align_x, align_y); + + // Since alignment is in bytes, it could be smaller than sizeof(DataT). + // Handle this (unlikely) case here. + RAFT_EXPECTS(sizeof(DataT) <= byte_alignment, + "Input matrix must be aligned to size of elements."); + + // Compute number of elements that can be loaded in one instruction + // without causing misalignent errors. + int vec_len_aligned; + if (byte_alignment % sizeof(DataT) == 0) { + // In the future, we might support `int8_t` input. In that case, + // byte_alignment / sizeof(DataT) might exceed 4. We maximize at 4 here, to + // prevent adding more cases in dispatch (which are expensive to compile). + vec_len_aligned = std::min(4, int(byte_alignment / sizeof(DataT))); + } else { + vec_len_aligned = 1; + } + + dispatch(is_row_major, vec_len_aligned, [&](auto row_major, auto vec_len_aligned) { + // row_major and vec_len are std::integral_constants of type bool and int + // respectively. + + // To keep compile times in check, we only specialize on veclen > 1 when + // the inner loop is relatively cheap (< 5 flops). + constexpr int vec_len_op = distance_op.expensive_inner_loop ? 1 : vec_len_aligned(); + + // Prevent double, vec_len=4 combination (this is not supported) + constexpr int vec_len = std::min(vec_len_op, static_cast(16 / sizeof(DataT))); + + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + typedef typename std::conditional::type Policy; + + return pairwise_matrix( + distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream); + }); +} + +template +void distance_matrix_cutlass_dispatch(opT cutlass_op, + IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + const DataT* x_norm, + const DataT* y_norm, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // Determine leading dimensions and possibly flip order of passing x and y if + // column_major. + IdxT ldx, ldy, ld_out; + if (is_row_major) { + ldx = k, ldy = k, ld_out = n; + } else { + std::swap(x, y); + std::swap(x_norm, y_norm); + std::swap(m, n); + ldx = m, ldy = n, ld_out = n; + } + + size_t align_x = alignment_of_2d_array(x, ldx); + size_t align_y = alignment_of_2d_array(y, ldy); + size_t byte_alignment = min(align_x, align_y); + + // Since alignment is in bytes, it could be smaller than sizeof(DataT). + // Handle this (unlikely) case here. + RAFT_EXPECTS(sizeof(DataT) <= byte_alignment, + "Input matrix must be aligned to size of elements."); + + // Compute number of elements that can be loaded in one instruction + // without causing misalignent errors. + int vec_len_aligned = (byte_alignment % sizeof(DataT) == 0) ? byte_alignment / sizeof(DataT) : 1; + + dispatch(is_row_major, vec_len_aligned, [&](auto row_major, auto vec_len_aligned) { + // row_major and vec_len are std::integral_constants of type bool and int + // respectively. + + // Prevent double, vec_len=4 combination (this is not supported) + constexpr int vec_len = std::min(vec_len_aligned(), static_cast(16 / sizeof(DataT))); + + cutlassDistanceKernel( + x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); + }); +} + +}; // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh new file mode 100644 index 0000000000..7c1052d726 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft::distance::detail { + +template +__global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + opT distance_op, + FinOpT fin_op) +{ + extern __shared__ char smem[]; + + // Wrap operator back into lambdas. This is temporary and should be removed. + // See: https://github.com/rapidsai/raft/issues/1323 + auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { + distance_op.core(acc, x, y); + }; + auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { + // Use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) + distance_op.template epilog(acc, regxn, regyn, gridStrideX, gridStrideY); + }; + + // No support for row_epilog_op. + auto row_epilog_op = raft::void_op(); + + // Always write output + constexpr bool write_out = true; + constexpr bool use_norms = distance_op.use_norms; + PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + _xn, + _yn, + dOutput, + smem, + core_op, + epilog_op, + fin_op, + row_epilog_op); + obj.run(); +} + +template +void pairwise_matrix(OpT distance_op, + FinOpT fin_op, + const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + cudaStream_t stream) +{ + dim3 blk(Policy::Nthreads); + // Use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) + size_t smem_size = distance_op.template shared_mem_size(); + // Obtain function pointer to kernel + auto kernel = pairwise_matrix_kernel; + dim3 grid = launchConfigGenerator(m, n, smem_size, kernel); + + kernel<<>>( + x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh deleted file mode 100644 index 5d516e7830..0000000000 --- a/cpp/include/raft/distance/detail/russell_rao.cuh +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Russell Rao distance matrix: - * It computes the following equation: - Cij = (k - sum(x_i * y_i)) / k - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void russellRaoImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - const float one_over_k = 1.0 / k; - // epilogue operation lambda for final value calculation - auto epilog_lambda = [k, one_over_k] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = (k - acc[i][j]) * one_over_k; - } - } - }; - - if (isRowMajor) { - constexpr auto russellRaoRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoRowMajor); - - russellRaoRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - constexpr auto russellRaoColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoColMajor); - russellRaoColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void russellRao(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - russellRaoImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - russellRaoImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - russellRaoImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - -/** - * @brief the Russell Rao distance matrix calculation - * It computes the following equation: - Cij = (k - sum(x_i * y_i)) / k - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void russellRaoImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - typedef std::is_same is_bool; - typedef typename std::conditional::type russellRaoOutType; - Index_ lda, ldb, ldd; - russellRaoOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - russellRao( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - - } else { - lda = n, ldb = m, ldd = m; - russellRao( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 843a22c593..5216902635 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -101,9 +102,6 @@ void distance(raft::resources const& handle, * @param worksize number of bytes of the workspace * @param isRowMajor whether the matrices are row-major or col-major * @param metric_arg metric argument (used for Minkowski distance) - * - * @note if workspace is passed as nullptr, this will return in - * worksize, the number of bytes of workspace required */ template (x, y, m, n, k); + workspace.resize(worksize, stream); + detail::distance( + handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); + }; + switch (metric) { - case raft::distance::DistanceType::L2Expanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::Canberra: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L2SqrtExpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::CorrelationExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::CosineExpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::CosineExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L1: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::HammingUnexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L2Unexpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::HellingerExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case raft::distance::DistanceType::InnerProduct: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::Linf: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::JensenShannon: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::HellingerExpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::KLDivergence: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::LpUnexpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor, metric_arg); + case DistanceType::L1: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::Canberra: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::L2Expanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::HammingUnexpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::L2SqrtExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::JensenShannon: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::L2SqrtUnexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::RusselRaoExpanded: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::L2Unexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::KLDivergence: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::Linf: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::CorrelationExpanded: - detail:: - pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::LpUnexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::InnerProduct: - detail::pairwise_distance_impl( - handle, x, y, dist, m, n, k, workspace, isRowMajor); + case DistanceType::RusselRaoExpanded: + dispatch(std::integral_constant{}); break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; diff --git a/cpp/include/raft/distance/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp deleted file mode 100644 index f6eb4614f9..0000000000 --- a/cpp/include/raft/distance/distance_type.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This file is deprecated and will be removed at some point in a future release. - * Please use `raft/distance/distance_types.hpp` instead. - */ - -#pragma once - -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." \ - " Please use distance_types.hpp instead.") - -#include \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations/detail/chebyshev.cuh b/cpp/include/raft/distance/specializations/detail/l_inf.cuh similarity index 100% rename from cpp/include/raft/distance/specializations/detail/chebyshev.cuh rename to cpp/include/raft/distance/specializations/detail/l_inf.cuh diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index a0c35ca9a8..8daa398b49 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index 9301580a9e..b15cb222b4 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -318,4 +318,4 @@ struct Contractions_NT { } // namespace detail } // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu rename to cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu diff --git a/cpp/src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu rename to cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index df8fa54116..7cd0feba61 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -119,19 +119,19 @@ if(BUILD_TESTS) PATH test/distance/dist_adj.cu test/distance/dist_canberra.cu - test/distance/dist_chebyshev.cu test/distance/dist_correlation.cu test/distance/dist_cos.cu - test/distance/dist_euc_exp.cu - test/distance/dist_euc_unexp.cu - test/distance/dist_eucsqrt_exp.cu test/distance/dist_hamming.cu test/distance/dist_hellinger.cu test/distance/dist_inner_product.cu test/distance/dist_jensen_shannon.cu test/distance/dist_kl_divergence.cu test/distance/dist_l1.cu - test/distance/dist_minkowski.cu + test/distance/dist_l2_exp.cu + test/distance/dist_l2_unexp.cu + test/distance/dist_l2_sqrt_exp.cu + test/distance/dist_l_inf.cu + test/distance/dist_lp_unexp.cu test/distance/dist_russell_rao.cu test/distance/masked_nn.cu test/distance/masked_nn_compress_to_bits.cu diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_l2_exp.cu similarity index 98% rename from cpp/test/distance/dist_euc_exp.cu rename to cpp/test/distance/dist_l2_exp.cu index 567e279691..ae67215e51 100644 --- a/cpp/test/distance/dist_euc_exp.cu +++ b/cpp/test/distance/dist_l2_exp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/dist_eucsqrt_exp.cu b/cpp/test/distance/dist_l2_sqrt_exp.cu similarity index 98% rename from cpp/test/distance/dist_eucsqrt_exp.cu rename to cpp/test/distance/dist_l2_sqrt_exp.cu index d717158649..94d254f44b 100644 --- a/cpp/test/distance/dist_eucsqrt_exp.cu +++ b/cpp/test/distance/dist_l2_sqrt_exp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_l2_unexp.cu similarity index 98% rename from cpp/test/distance/dist_euc_unexp.cu rename to cpp/test/distance/dist_l2_unexp.cu index 311ad190e2..d74a41d2a4 100644 --- a/cpp/test/distance/dist_euc_unexp.cu +++ b/cpp/test/distance/dist_l2_unexp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_l_inf.cu similarity index 98% rename from cpp/test/distance/dist_chebyshev.cu rename to cpp/test/distance/dist_l_inf.cu index abad828de7..b9d6413a10 100644 --- a/cpp/test/distance/dist_chebyshev.cu +++ b/cpp/test/distance/dist_l_inf.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_lp_unexp.cu similarity index 98% rename from cpp/test/distance/dist_minkowski.cu rename to cpp/test/distance/dist_lp_unexp.cu index af2661da3a..9d6f5921a7 100644 --- a/cpp/test/distance/dist_minkowski.cu +++ b/cpp/test/distance/dist_lp_unexp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.