diff --git a/cpp/include/raft/distance/canberra.cuh b/cpp/include/raft/distance/canberra.cuh new file mode 100644 index 0000000000..b87c295eb0 --- /dev/null +++ b/cpp/include/raft/distance/canberra.cuh @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the canberra distance matrix calculation implementer + * It computes the following equation: cij = max(cij, op(ai-bj)) + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and cols of C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param fin_op the final gemm epilogue lambda + * @param stream cuda stream to launch work + */ +template +static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + const auto diff = raft::L1Op()(x - y); + const auto add = raft::myAbs(x) + raft::myAbs(y); + // deal with potential for 0 in denominator by + // forcing 1/0 instead + acc += ((add != 0) * diff / (add + (add == 0))); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { return; }; + + if (isRowMajor) { + auto canberraRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); + + canberraRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + auto canberraColMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); + canberraColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the canberra distance matrix calculation + * It computes the following equation: cij = max(cij, op(ai-bj)) + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and cols of C/D + * @param[in] k number of cols of A and B + * @param[in] pA input matrix + * @param[in] pB input matrix + * @param[out] pD output matrix + * @param[in] fin_op the final element-wise epilogue lambda + * @param[in] stream cuda stream to launch work + * @param[in] isRowMajor whether the input and output matrices are row major + */ +template +void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + canberraOutType; + Index_ lda, ldb, ldd; + canberraOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + canberra( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + } else { + lda = n, ldb = m, ldd = m; + canberra( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/chebyshev.cuh b/cpp/include/raft/distance/chebyshev.cuh new file mode 100644 index 0000000000..8d53408cf8 --- /dev/null +++ b/cpp/include/raft/distance/chebyshev.cuh @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the Chebyshev distance matrix calculation implementer + * It computes the following equation: cij = max(cij, op(ai-bj)) + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and cols of C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[out] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + const auto diff = raft::L1Op()(x - y); + acc = raft::myMax(acc, diff); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { return; }; + + if (isRowMajor) { + auto chebyshevRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + chebyshevRowMajor); + + chebyshevRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + auto chebyshevColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + chebyshevColMajor); + chebyshevColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the chebyshev distance matrix calculation + * It computes the following equation: cij = max(cij, op(ai-bj)) + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and cols of C/D + * @param[in] k number of cols of A and B + * @param[in] pA input matrix + * @param[in] pB input matrix + * @param[out] pD output matrix + * @param[in] fin_op the final element-wise epilogue lambda + * @param[in] stream cuda stream to launch work + * @param[in] isRowMajor whether the input and output matrices are row major + */ +template +void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + chebyshevOutType; + Index_ lda, ldb, ldd; + chebyshevOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + chebyshev( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + } else { + lda = n, ldb = m, ldd = m; + chebyshev( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 579b3bb446..1b39a6ec18 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -19,9 +19,13 @@ #include #include #include +#include +#include #include #include +#include #include +#include #include namespace raft { @@ -34,7 +38,7 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::euclideanAlgo1(m, n, k, x, y, dist, false, (AccType *)workspace, worksize, @@ -57,7 +61,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::euclideanAlgo1(m, n, k, x, y, dist, true, (AccType *)workspace, worksize, @@ -71,7 +75,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::cosineAlgo1( m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, isRowMajor); @@ -84,7 +88,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::euclideanAlgo2(m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); @@ -97,7 +101,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::euclideanAlgo2(m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); @@ -110,12 +114,63 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::l1Impl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::chebyshevImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::hellingerImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::minkowskiImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor, metric_arg); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::canberraImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); + } +}; + } // anonymous namespace /** @@ -178,11 +233,12 @@ template void distance(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true) { + FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, + InType metric_arg = 2.0f) { DistanceImpl distImpl; distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, - isRowMajor); + isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -211,13 +267,14 @@ template void distance(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor = true) { + cudaStream_t stream, bool isRowMajor = true, + InType metric_arg = 2.0f) { auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; }; distance(x, y, dist, m, n, k, workspace, worksize, default_fin_op, - stream, isRowMajor); + stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -244,12 +301,14 @@ template void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, raft::mr::device::buffer &workspace, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, + Type metric_arg = 2.0f) { auto worksize = getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); - distance( - x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor); + distance(x, y, dist, m, n, k, + workspace.data(), worksize, + stream, isRowMajor, metric_arg); } template @@ -257,7 +316,7 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, raft::mr::device::buffer &workspace, raft::distance::DistanceType metric, cudaStream_t stream, - bool isRowMajor = true) { + bool isRowMajor = true, Type metric_arg = 2.0f) { switch (metric) { case raft::distance::DistanceType::L2Expanded: pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; + case raft::distance::DistanceType::Linf: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::HellingerExpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::LpUnexpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor, metric_arg); + break; + case raft::distance::DistanceType::Canberra: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; diff --git a/cpp/include/raft/distance/hellinger.cuh b/cpp/include/raft/distance/hellinger.cuh new file mode 100644 index 0000000000..f7ad3ed1ba --- /dev/null +++ b/cpp/include/raft/distance/hellinger.cuh @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include + +namespace raft { +namespace distance { + +/** + * @brief the Hellinger distance matrix using the expanded form: + * It computes the following equation: + cij = sqrt(1 - sum(sqrt(x_k * y_k))) + * This distance computation modifies A and B by computing a sqrt + * and then performing a `pow(x, 2)` to convert it back. Because of this, + * it is possible that the values in A and B might differ slightly + * after this is invoked. + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + auto unaryOp_lambda = [] __device__(DataT input) { + return raft::mySqrt(input); + }; + // First sqrt x and y + raft::linalg::unaryOp( + (DataT *)x, x, m * k, unaryOp_lambda, stream); + + if (x != y) { + raft::linalg::unaryOp( + (DataT *)y, y, n * k, unaryOp_lambda, stream); + } + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + // This is sqrt(x) * sqrt(y). + const auto product = x * y; + acc += product; + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + const auto finalVal = (1 - acc[i][j]); + const auto rectifier = (!signbit(finalVal)); + acc[i][j] = raft::mySqrt(rectifier * finalVal); + } + } + }; + + if (isRowMajor) { + auto hellingerRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hellingerRowMajor); + + hellingerRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + auto hellingerColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hellingerColMajor); + hellingerColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + // Revert sqrt of x and y + raft::linalg::unaryOp( + (DataT *)x, x, m * k, unaryOp_lambda, stream); + if (x != y) { + raft::linalg::unaryOp( + (DataT *)y, y, n * k, unaryOp_lambda, stream); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Hellinger distance matrix calculation + * It computes the following equation: + sqrt(1 - sum(sqrt(x_k * y_k)) + * This distance computation modifies A and B by computing a sqrt + * and then performing a `pow(x, 2)` to convert it back. Because of this, + * it is possible that the values in A and B might differ slightly + * after this is invoked. + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + hellingerOutType; + Index_ lda, ldb, ldd; + hellingerOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + hellinger( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + hellinger( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/minkowski.cuh b/cpp/include/raft/distance/minkowski.cuh new file mode 100644 index 0000000000..803f5fc78a --- /dev/null +++ b/cpp/include/raft/distance/minkowski.cuh @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the unexpanded Minkowski distance matrix calculation + * It computes the following equation: cij = sum(|x - y|^p)^(1/p) + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and cols of C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] pD output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + * @param[in] the value of `p` for Minkowski (l-p) distances. + */ +template +void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream, DataT p) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [p] __device__(AccT & acc, DataT & x, DataT & y) { + const auto diff = raft::L1Op()(x - y); + acc += raft::myPow(diff, p); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [p] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { + const auto one_over_p = 1.0f / p; +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = raft::myPow(acc[i][j], one_over_p); + } + } + }; + + if (isRowMajor) { + auto minkowskiUnExpRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + minkowskiUnExpRowMajor); + + minkowskiUnExpRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + + } else { + auto minkowskiUnExpColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + minkowskiUnExpColMajor); + + minkowskiUnExpColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, + fin_op, stream, metric_arg); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, + fin_op, stream, metric_arg); + } else { + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); + } +} + +/** + * @brief the unexpanded minkowski distance matrix calculation + * It computes the following equation: cij = sum(|x - y|^p)^(1/p) + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ index type + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and cols of C/D + * @param[in] k number of cols of A and B + * @param[in] pA input matrix + * @param[in] pB input matrix + * @param[out] pD output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + * @param[in] isRowMajor whether the input and output matrices are row major + * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. + */ +template +void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA, + const InType *pB, OutType *pD, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + LpUnexpOutType; + LpUnexpOutType *pDcast = reinterpret_cast(pD); + Index_ lda, ldb, ldd; + + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + minkowskiUnExp( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream, metric_arg); + } else { + lda = n, ldb = m, ldd = m; + minkowskiUnExp( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream, metric_arg); + } +} + +}; // end namespace distance +}; // end namespace raft diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index c590abb142..aa711a9140 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -338,6 +338,7 @@ struct Contractions_NT { if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; +#pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) { ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset); @@ -351,6 +352,7 @@ struct Contractions_NT { } else { const auto numRows = k; auto koffset = scolid; +#pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) { @@ -369,6 +371,7 @@ struct Contractions_NT { if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; +#pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset); @@ -382,6 +385,7 @@ struct Contractions_NT { } else { auto numRows = k; auto koffset = scolid; +#pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0dec4be833..f94a8d9525 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -19,10 +19,14 @@ add_executable(test_raft test/cudart_utils.cpp test/cluster_solvers.cu test/distance/dist_adj.cu + test/distance/dist_canberra.cu + test/distance/dist_chebyshev.cu test/distance/dist_cos.cu test/distance/dist_euc_exp.cu test/distance/dist_euc_unexp.cu + test/distance/dist_hellinger.cu test/distance/dist_l1.cu + test/distance/dist_minkowski.cu test/distance/fused_l2_nn.cu test/eigen_solvers.cu test/handle.cpp diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu new file mode 100644 index 0000000000..10bc4d1899 --- /dev/null +++ b/cpp/test/distance/dist_canberra.cu @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceCanberra + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceCanberra DistanceCanberraF; +TEST_P(DistanceCanberraF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceCanberra DistanceCanberraD; +TEST_P(DistanceCanberraD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu new file mode 100644 index 0000000000..6a2b02863a --- /dev/null +++ b/cpp/test/distance/dist_chebyshev.cu @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceLinf + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceLinf DistanceLinfF; +TEST_P(DistanceLinfF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceLinf DistanceLinfD; +TEST_P(DistanceLinfD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu new file mode 100644 index 0000000000..39dc7aaeff --- /dev/null +++ b/cpp/test/distance/dist_hellinger.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceHellingerExp + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceHellingerExp DistanceHellingerExpF; +TEST_P(DistanceHellingerExpF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceHellingerExp DistanceHellingerExpD; +TEST_P(DistanceHellingerExpD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu new file mode 100644 index 0000000000..42b8e294ac --- /dev/null +++ b/cpp/test/distance/dist_minkowski.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceLpUnexp + : public DistanceTest { +}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL, 4.0f}, + {0.001f, 1024, 32, 1024, true, 1234ULL, 3.0f}, + {0.001f, 32, 1024, 1024, true, 1234ULL, 4.0f}, + {0.003f, 1024, 1024, 1024, true, 1234ULL, 3.0f}, + {0.001f, 1024, 1024, 32, false, 1234ULL, 4.0f}, + {0.001f, 1024, 32, 1024, false, 1234ULL, 3.0f}, + {0.001f, 32, 1024, 1024, false, 1234ULL, 4.0f}, + {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f}, +}; +typedef DistanceLpUnexp DistanceLpUnexpF; +TEST_P(DistanceLpUnexpF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL, 4.0}, + {0.001, 1024, 32, 1024, true, 1234ULL, 3.0}, + {0.001, 32, 1024, 1024, true, 1234ULL, 4.0}, + {0.003, 1024, 1024, 1024, true, 1234ULL, 3.0}, + {0.001, 1024, 1024, 32, false, 1234ULL, 4.0}, + {0.001, 1024, 32, 1024, false, 1234ULL, 3.0}, + {0.001, 32, 1024, 1024, false, 1234ULL, 4.0}, + {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0}, +}; +typedef DistanceLpUnexp DistanceLpUnexpD; +TEST_P(DistanceLpUnexpD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index d6f06c186a..fc7b064205 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -47,9 +47,9 @@ __global__ void naiveDistanceKernel(DataType *dist, const DataType *x, } template -__global__ void naiveL1DistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, int k, - bool isRowMajor) { +__global__ void naiveL1_Linf_CanberraDistanceKernel( + DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, + raft::distance::DistanceType type, bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) { @@ -63,7 +63,16 @@ __global__ void naiveL1DistanceKernel(DataType *dist, const DataType *x, auto a = x[xidx]; auto b = y[yidx]; auto diff = (a > b) ? (a - b) : (b - a); - acc += diff; + if (type == raft::distance::DistanceType::Linf) { + acc = raft::myMax(acc, diff); + } else if (type == raft::distance::DistanceType::Canberra) { + const auto add = raft::myAbs(a) + raft::myAbs(b); + // deal with potential for 0 in denominator by + // forcing 1/0 instead + acc += ((add != 0) * diff / (add + (add == 0))); + } else { + acc += diff; + } } int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; @@ -101,17 +110,69 @@ __global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); } +template +__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) { + return; + } + + DataType acc_ab = DataType(0); + + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + acc_ab += raft::mySqrt(a) * raft::mySqrt(b); + } + + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + acc_ab = 1 - acc_ab; + auto rectifier = (!signbit(acc_ab)); + dist[outidx] = raft::mySqrt(rectifier * acc_ab); +} + +template +__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor, DataType p) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + DataType acc = DataType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + auto diff = raft::L1Op()(a - b); + acc += raft::myPow(diff, p); + } + auto one_over_p = 1 / p; + acc = raft::myPow(acc, one_over_p); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + template void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, raft::distance::DistanceType type, - bool isRowMajor) { + bool isRowMajor, DataType metric_arg = 2.0f) { static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); switch (type) { + case raft::distance::DistanceType::Canberra: + case raft::distance::DistanceType::Linf: case raft::distance::DistanceType::L1: - naiveL1DistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveL1_Linf_CanberraDistanceKernel + <<>>(dist, x, y, m, n, k, type, isRowMajor); break; case raft::distance::DistanceType::L2SqrtUnexpanded: case raft::distance::DistanceType::L2Unexpanded: @@ -124,6 +185,14 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, naiveCosineDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor); break; + case raft::distance::DistanceType::HellingerExpanded: + naiveHellingerDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::LpUnexpanded: + naiveLpUnexpDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); + break; default: FAIL() << "should be here\n"; } @@ -136,6 +205,7 @@ struct DistanceInputs { int m, n, k; bool isRowMajor; unsigned long long int seed; + DataType metric_arg = 2.0f; }; template @@ -148,13 +218,15 @@ template void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2, int m, int n, int k, DistanceInputs ¶ms, DataType threshold, char *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor) { + cudaStream_t stream, bool isRowMajor, + DataType metric_arg = 2.0f) { auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) { dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val; return d_val; }; raft::distance::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, + metric_arg); } template @@ -166,6 +238,7 @@ class DistanceTest : public ::testing::TestWithParam> { int m = params.m; int n = params.n; int k = params.k; + DataType metric_arg = params.metric_arg; bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -174,9 +247,17 @@ class DistanceTest : public ::testing::TestWithParam> { raft::allocate(dist_ref, m * n); raft::allocate(dist, m * n); raft::allocate(dist2, m * n); - r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); - r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); - naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor); + if (distanceType == raft::distance::DistanceType::HellingerExpanded) { + // Hellinger works only on positive numbers + r.uniform(x, m * k, DataType(0.0), DataType(1.0), stream); + r.uniform(y, n * k, DataType(0.0), DataType(1.0), stream); + } else { + r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); + r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); + } + + naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, + metric_arg); char *workspace = nullptr; size_t worksize = raft::distance::getWorkspaceSize> { DataType threshold = -10000.f; distanceLauncher(x, y, dist, dist2, m, n, k, params, threshold, workspace, worksize, - stream, isRowMajor); + stream, isRowMajor, metric_arg); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(workspace)); }