From b4939b784d545e0b3ed092d0ed4822fa82cc36ef Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 17 Oct 2022 21:59:02 -0400 Subject: [PATCH] Moving kernel gramm primitives to `raft::distance::kernels` (#920) Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/920 --- cpp/CMakeLists.txt | 10 + cpp/bench/CMakeLists.txt | 1 + cpp/bench/distance/distance_common.cuh | 2 +- cpp/bench/distance/kernels.cu | 123 ++++++ .../distance/detail/kernels/gram_matrix.cuh | 218 ++++++++++ .../detail/kernels/kernel_factory.cuh | 48 +++ .../detail/kernels/kernel_matrices.cuh | 376 ++++++++++++++++ cpp/include/raft/distance/distance_types.hpp | 21 + cpp/include/raft/distance/kernels.cuh | 32 ++ .../specializations/detail/kernels.cuh | 31 ++ .../distance/specializations/distance.cuh | 1 + cpp/include/raft/linalg/init.cuh | 14 + cpp/include/raft/util/cache.cuh | 406 ++++++++++++++++++ cpp/include/raft/util/fast_int_div.cuh | 123 ++++++ .../detail/kernels/gram_matrix_base_double.cu | 19 + .../detail/kernels/gram_matrix_base_float.cu | 19 + .../kernels/polynomial_kernel_double_int.cu | 19 + .../kernels/polynomial_kernel_float_int.cu | 19 + .../detail/kernels/rbf_kernel_double.cu | 19 + .../detail/kernels/rbf_kernel_float.cu | 19 + .../detail/kernels/tanh_kernel_double.cu | 19 + .../detail/kernels/tanh_kernel_float.cu | 19 + cpp/test/CMakeLists.txt | 1 + cpp/test/distance/gram.cu | 189 ++++++++ 24 files changed, 1747 insertions(+), 1 deletion(-) create mode 100644 cpp/bench/distance/kernels.cu create mode 100644 cpp/include/raft/distance/detail/kernels/gram_matrix.cuh create mode 100644 cpp/include/raft/distance/detail/kernels/kernel_factory.cuh create mode 100644 cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh create mode 100644 cpp/include/raft/distance/kernels.cuh create mode 100644 cpp/include/raft/distance/specializations/detail/kernels.cuh create mode 100644 cpp/include/raft/util/cache.cuh create mode 100644 cpp/include/raft/util/fast_int_div.cuh create mode 100644 cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu create mode 100644 cpp/test/distance/gram.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12bebfa2a5..21e2103c4b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -248,11 +248,21 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu src/distance/specializations/detail/cosine.cu + src/distance/specializations/detail/cosine.cu src/distance/specializations/detail/hamming_unexpanded.cu src/distance/specializations/detail/hellinger_expanded.cu src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu 
src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu + src/distance/specializations/detail/kernels/gram_matrix_base_double.cu + src/distance/specializations/detail/kernels/gram_matrix_base_float.cu + src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu + src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu +# These are somehow missing a kernel definition which is causing a compile error. +# src/distance/specializations/detail/kernels/rbf_kernel_double.cu +# src/distance/specializations/detail/kernels/rbf_kernel_float.cu + src/distance/specializations/detail/kernels/tanh_kernel_double.cu + src/distance/specializations/detail/kernels/tanh_kernel_float.cu src/distance/specializations/detail/kl_divergence_float_float_float_int.cu src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu src/distance/specializations/detail/kl_divergence_double_double_double_int.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 4af3df9a1a..9c6b60d83b 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -85,6 +85,7 @@ if(BUILD_BENCH) bench/distance/distance_exp_l2.cu bench/distance/distance_l1.cu bench/distance/distance_unexp_l2.cu + bench/distance/kernels.cu bench/main.cpp OPTIONAL DIST ) diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh index 4f1a8ccab1..73faacce37 100644 --- a/cpp/bench/distance/distance_common.cuh +++ b/cpp/bench/distance/distance_common.cuh @@ -15,8 +15,8 @@ */ #include -#include #include +#include #if defined RAFT_DISTANCE_COMPILED #include #endif diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/distance/kernels.cu new file mode 100644 index 0000000000..5c9c2cc2ed --- /dev/null +++ b/cpp/bench/distance/kernels.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#if defined RAFT_DISTANCE_COMPILED +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::distance::kernels { + +using namespace raft::distance::kernels; +struct GramTestParams { + int m; // m parameter of the GEMM + int k; // k parameter of the GEMM + int n; // n parameter of the GEMM + KernelParams kernel_params; + bool is_row_major; +}; // struct GramTestParams + +template +struct GramMatrix : public fixture { + GramMatrix(const GramTestParams& p) + : params(p), handle(stream), A(0, stream), B(0, stream), C(0, stream) + { + kernel = std::unique_ptr>( + KernelFactory::create(p.kernel_params, handle.get_cublas_handle())); + + A.resize(params.m * params.k, stream); + B.resize(params.k * params.n, stream); + C.resize(params.m * params.n, stream); + raft::random::Rng r(123456ULL); + r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream); + r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream); + } + + ~GramMatrix() + { + A.release(); + B.release(); + C.release(); + } + + void run_benchmark(::benchmark::State& state) override + { + if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); } + loop_on_state(state, [this]() { + (*this->kernel)(A.data(), + this->params.m, + this->params.k, + B.data(), + this->params.n, + C.data(), + this->params.is_row_major, + this->stream); + }); + } + + private: + const raft::handle_t handle; + std::unique_ptr> kernel; + GramTestParams params; + + rmm::device_uvector A; // input matrix A, size [m * k] + rmm::device_uvector B; // input matrix B, size [n * k] + rmm::device_uvector C; // output matrix C, size [m*n] +}; + +static std::vector getInputs() +{ + std::vector param_vec; + std::vector kernel_params{KernelParams{LINEAR, 3, 1, 0}, + KernelParams{POLYNOMIAL, 2, 1.3, 1}, + KernelParams{TANH, 2, 0.5, 2.4}, + KernelParams{RBF, 2, 0.5, 0}}; + struct TestSize { + int m; + int k; + int n; + }; + std::vector data_size{{4096, 10, 1024}, + {4096, 100, 1024}, + {4096, 1000, 1024}, + {4096, 10000, 1024}, + {100000, 10, 1024}, + {100000, 100, 1024}, + {100000, 1000, 1024}}; + + param_vec.reserve(kernel_params.size() * data_size.size()); + for (TestSize s : data_size) { + for (auto kernel : kernel_params) { + for (bool row_major : {false, true}) { + param_vec.push_back(GramTestParams{s.m, s.k, s.n, kernel, row_major}); + } + } + } + return param_vec; +} + +RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); +RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); + +} // namespace raft::bench::distance::kernels diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh new file mode 100644 index 0000000000..54ac490ca4 --- /dev/null +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +namespace raft::distance::kernels::detail { + +/** + * Base class for general Gram matrices + * A Gram matrix is the Hermitian matrix of inner probucts G_ik = + * Here, the inner product is evaluated for all elements from vectors sets X1, + * and X2. + * + * To be more precise, on exit the output buffer will store: + * - if is_row_major == true: out[j+k*n1] = , + * - if is_row_major == false: out[j*n2 + k] = , + * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector + * from the x2 set. + */ +template +class GramMatrixBase { + cublasHandle_t cublas_handle; + + public: + GramMatrixBase(cublasHandle_t cublas_handle) : cublas_handle(cublas_handle){}; + + virtual ~GramMatrixBase(){}; + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + virtual void operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1 = 0, + int ld2 = 0, + int ld_out = 0) + { + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + virtual void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + + // private: + // The following methods should be private, they are kept public to avoid: + // "error: The enclosing parent function ("distance") for an extended + // __device__ lambda cannot have private or protected access within its class" + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
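 *
 * A minimal usage sketch of the public operator() (which, for this base
 * class, ends up in this dot-product primitive). Illustrative only: the
 * cuBLAS handle `cublas_handle`, the stream `stream` and the column major
 * device buffers d_x1 [n1*n_cols], d_x2 [n2*n_cols], d_out [n1*n2] are
 * assumed to exist already.
 * @code{.cpp}
 *   GramMatrixBase<float> gram(cublas_handle);
 *   // ld1/ld2/ld_out default to 0, so operator() falls back to n1, n2 and n1.
 *   gram(d_x1, n1, n_cols, d_x2, n2, d_out, false, stream);
 * @endcode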
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of colums (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + void linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2, + ld2, + x1, + ld1, + &beta, + out, + ld_out, + stream)); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1, + ld1, + x2, + ld2, + &beta, + out, + ld_out, + stream)); + } + } + + /** Calculates the Gram matrix using Euclidean distance. + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + virtual void distance(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + raft::distance::distance( + x1, x2, out, n1, n2, n_cols, stream, is_row_major); + } +}; +}; // end namespace raft::distance::kernels::detail \ No newline at end of file diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh new file mode 100644 index 0000000000..1aa6809bcd --- /dev/null +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "gram_matrix.cuh" +#include "kernel_matrices.cuh" +#include +#include + +namespace raft::distance::kernels::detail { + +template +class KernelFactory { + public: + static GramMatrixBase* create(KernelParams params, cublasHandle_t cublas_handle) + { + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(cublas_handle); break; + case POLYNOMIAL: + res = new PolynomialKernel(params.degree, gamma, coef0, cublas_handle); + break; + case TANH: res = new TanhKernel(gamma, coef0, cublas_handle); break; + case RBF: res = new RBFKernel(gamma); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; + } +}; + +}; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh new file mode 100644 index 0000000000..6d59e1c7c5 --- /dev/null +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gram_matrix.cuh" +#include + +#include +#include + +namespace raft::distance::kernels::detail { + +/** Epiloge function for polynomial kernel without padding. + * Calculates output = (gain*in + offset)^exponent + * @param inout device vector in column major format, size [len] + * @param len array length + * @param exponent + * @param gain + * @param offset + */ +template +__global__ void polynomial_kernel_nopad( + math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = pow(gain * inout[tid] + offset, exponent); + } +} + +/** Epiloge function for polynomial kernel with padding. + * Calculates output = (gain*input + offset)^exponent + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of colums + * @param exponent + * @param gain + * @param offset + */ +template +__global__ void polynomial_kernel( + math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); + } +} + +/** Epiloge function for tanh kernel without padding. 
+ * Calculates output = tanh(gain*input + offset) + * @param inout device vector, size [len] + * @param len length of the input vector + * @param gain + * @param offset + */ +template +__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = tanh(gain * inout[tid] + offset); + } +} + +/** Epiloge function for tanh kernel without padding. + * Calculates output = tanh(gain*input + offset) + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of colums + * @param gain + * @param offset + */ +template +__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); + } +} + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +class PolynomialKernel : public GramMatrixBase { + exp_t exponent; + math_t gain; + math_t offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) + { + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, exponent, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + polynomial_kernel<<>>(inout, ld, n1, n2, exponent, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + public: + /** + * Constructs a polynomial kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = (gain* + offset)^exponent + * + * @tparam math_t floating point type + * @tparam exp_t type of exponent + * @param exponent + * @param gain + * @param offset + * @param cublas_handle + */ + PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t cublas_handle) + : GramMatrixBase(cublas_handle), exponent(exponent), gain(gain), offset(offset) + { + } + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. 
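 *
 * For instance, with gain = 0.5, offset = 2.4 and exponent = 2 (the
 * POLYNOMIAL configuration exercised in the tests added below), an entry
 * whose dot product equals 1 evaluates to (0.5 * 1 + 2.4)^2 = 8.41.
 *
 * An illustrative construction sketch through KernelFactory, mirroring the
 * benchmark added in this change; names such as cublas_handle, d_x1, d_x2,
 * d_out, n1, n_cols, n2, is_row_major and stream are placeholders assumed
 * to be set up by the caller:
 * @code{.cpp}
 *   KernelParams params{POLYNOMIAL, 2, 0.5, 2.4};  // kernel, degree, gamma, coef0
 *   std::unique_ptr<GramMatrixBase<float>> kernel(
 *     KernelFactory<float>::create(params, cublas_handle));
 *   (*kernel)(d_x1, n1, n_cols, d_x2, n2, d_out, is_row_major, stream);
 * @endcode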
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of features in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } +}; + +/** + * Create a kernel matrix using tanh kernel function. + */ +template +class TanhKernel : public GramMatrixBase { + math_t gain, offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) + { + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + public: + /** + * Constructs a tanh kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = tanh(gain* + offset) + * + * @tparam math_t floating point type + * @param gain + * @param offset + * @param cublas_handle + */ + TanhKernel(math_t gain, math_t offset, cublasHandle_t cublas_handle) + : GramMatrixBase(cublas_handle), gain(gain), offset(offset) + { + } + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] x1 device array of vectors, + * size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of features in x1 and x2 + * @param [in] x2 device array of vectors, + * size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } +}; + +/** + * Create a kernel matrix using RBF kernel function. + */ +template +class RBFKernel : public GramMatrixBase { + math_t gain; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) + { + const int n_minor = is_row_major ? 
cols : rows; + if (ld == n_minor) { + rbf_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, gain); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + rbf_kernel<<>>(inout, ld, n1, n2, gain); + } + } + + public: + /** + * Constructs a RBF kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = exp(-gain*|x1_i- x2_k|^2) + * + * @tparam math_t floating point type + * @param gain + */ + RBFKernel(math_t gain) : GramMatrixBase(NULL), gain(gain) {} + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of features in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1, currently only ld1 == n1 is supported + * @param ld2 leading dimension of x2, currently only ld2 == n2 is supported + * @param ld_out leading dimension of out, only ld_out == n1 is supported + */ + void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + int minor1 = is_row_major ? n_cols : n1; + int minor2 = is_row_major ? n_cols : n2; + int minor_out = is_row_major ? n2 : n1; + ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + + /** Customize distance function withe RBF epilogue */ + void distance(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + math_t gain = this->gain; + using index_t = int64_t; + + auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); }; + raft::distance::distance(const_cast(x1), + const_cast(x2), + out, + n1, + n2, + n_cols, + NULL, + 0, + fin_op, + stream, + is_row_major); + } +}; + +}; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/distance_types.hpp b/cpp/include/raft/distance/distance_types.hpp index f75263b00d..f5ed68af4a 100644 --- a/cpp/include/raft/distance/distance_types.hpp +++ b/cpp/include/raft/distance/distance_types.hpp @@ -65,5 +65,26 @@ enum DistanceType : unsigned short { /** Precomputed (special value) **/ Precomputed = 100 }; + +namespace kernels { +enum KernelType { LINEAR, POLYNOMIAL, RBF, TANH }; + +/** + * Parameters for kernel matrices. 
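 * For example, KernelParams{POLYNOMIAL, 3, 1.0, 1.0} selects the polynomial
 * kernel (<x_1, x_2> + 1)^3 from the list below.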
+ * The following kernels are implemented: + * - LINEAR \f[ K(x_1,x_2) = , \f] where \f$< , >\f$ is the dot product + * - POLYNOMIAL \f[ K(x_1, x_2) = (\gamma + \mathrm{coef0})^\mathrm{degree} \f] + * - RBF \f[ K(x_1, x_2) = \exp(- \gamma |x_1-x_2|^2) \f] + * - TANH \f[ K(x_1, x_2) = \tanh(\gamma + \mathrm{coef0}) \f] + */ +struct KernelParams { + // Kernel function parameters + KernelType kernel; //!< Type of the kernel function + int degree; //!< Degree of polynomial kernel (ignored by others) + double gamma; //!< multiplier in the + double coef0; //!< additive constant in poly and tanh kernels +}; +} // end namespace kernels + }; // namespace distance }; // end namespace raft diff --git a/cpp/include/raft/distance/kernels.cuh b/cpp/include/raft/distance/kernels.cuh new file mode 100644 index 0000000000..86f9f82406 --- /dev/null +++ b/cpp/include/raft/distance/kernels.cuh @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace raft::distance::kernels { + +// TODO: Need to expose formal APIs for this that are more consistent w/ other APIs in RAFT +using raft::distance::kernels::detail::GramMatrixBase; +using raft::distance::kernels::detail::KernelFactory; + +}; // end namespace raft::distance::kernels diff --git a/cpp/include/raft/distance/specializations/detail/kernels.cuh b/cpp/include/raft/distance/specializations/detail/kernels.cuh new file mode 100644 index 0000000000..75c9c023e8 --- /dev/null +++ b/cpp/include/raft/distance/specializations/detail/kernels.cuh @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +extern template class raft::distance::kernels::detail::GramMatrixBase; +extern template class raft::distance::kernels::detail::GramMatrixBase; + +extern template class raft::distance::kernels::detail::PolynomialKernel; +extern template class raft::distance::kernels::detail::PolynomialKernel; + +extern template class raft::distance::kernels::detail::TanhKernel; +extern template class raft::distance::kernels::detail::TanhKernel; + +// These are somehow missing a kernel definition which is causing a compile error +// extern template class raft::distance::kernels::detail::RBFKernel; +// extern template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index 73d075f260..053441d68a 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index 2fdf9dceb9..5a810bf2ba 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -19,6 +19,7 @@ #pragma once #include "detail/init.hpp" +#include namespace raft { namespace linalg { @@ -54,6 +55,19 @@ void range(T* out, int n, cudaStream_t stream) detail::range(out, n, stream); } +/** + * @brief Zeros the output. + * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void zero(T* out, int n, cudaStream_t stream) +{ + RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); +} + } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/util/cache.cuh b/cpp/include/raft/util/cache.cuh new file mode 100644 index 0000000000..8394ce83b8 --- /dev/null +++ b/cpp/include/raft/util/cache.cuh @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::cache { + +/** + * @brief Associative cache with least recently used replacement policy. + * + * SW managed cache in device memory, for ML algos where we can trade memory + * access for computation. The two main functions of this class are the + * management of cache indices, and methods to retrieve/store data using the + * cache indices. + * + * The index management can be considered as a hash map, where the int + * keys are the original vector indices that we want to store, and the values are + * the cache location of these vectors. The keys are hashed into a bucket + * whose size equals the associativity. These are the cache sets. If a cache + * set is full, then new indices are stored by replacing the oldest entries. 
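 *
 * As a concrete sizing example with the 200 MiB default of the constructor
 * below: for math_t = float and n_vec = 1024, 200 * 1024 * 1024 / (4 * 1024)
 * = 51200 vectors fit in the cache; with an associativity of, say, 32 this
 * gives 1600 cache sets of 32 slots each.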
+ * + * Using this index mapping we implement methods to store and retrive data from + * the cache buffer, where a unit of data that we are storing is math_t[n_vec]. + * For example in SVM we store full columns of the kernel matrix at each cache + * entry. + * + * Note: we should have a look if the index management could be simplified using + * concurrent_unordered_map.cuh from cudf. See Issue #914. + * + * Example usage: + * @code{.cpp} + * + * // An expensive calculation that we want to accelerate with caching: + * // we have n keys, and for each key we generate a vector with m elements. + * // The keys and the output values are stored in GPU memory. + * void calc(int *key, int n, int m, float *out, cudaStream_t stream) { + * for (k=0; k cache(h.get_device_allocator(), stream, m); + * + * // A buffer that we will reuse to store the cache indices. + * rmm::device_uvector cache_idx(h.get_device_allocator(), stream, n); + * + * void cached_calc(int *key, int n, int m, float *out, stream) { + * int n_cached = 0; + * + * cache.GetCacheIdxPartitioned(key, n, cache_idx.data(), &n_cached, + * cudaStream_t stream); + * + * // Note: GetCacheIdxPartitioned has reordered the keys so that + * // key[0..n_cached-1] are the keys already in the cache. + * // We collect the corresponding values + * cache.GetVecs(cache_idx.data(), n_cached, out, stream); + * + * // Calculate the elements not in the cache + * int non_cached = n - n_cached; + * if (non_cached > 0) { + * int *key_new = key + n_cached; + * int *cache_idx_new = cache_idx.data() + n_cached; + * float *out_new = out + n_cached * m; + * // AssignCacheIdx can permute the keys, therefore it has to come before + * // we call calc. + * // Note: a call to AssignCacheIdx should always be preceded with + * // GetCacheIdxPartitioned, because that initializes the cache_idx_new array + * // with the cache set (hash bucket) that correspond to the keys. + * // The cache idx will be assigned from that cache set. + * cache.AssignCacheIdx(key_new, non_cached, cache_idx_new, stream); + * + * calc(key_new, non_cached, m, out_new, stream); + * + * // Store the calculated vectors into the cache. 
+ * cache.StoreVecs(out_new, non_cached, non_cached, cache_idx_new, stream); + * } + * } + * @endcode + */ +template +class Cache { + public: + /** + * @brief Construct a Cache object + * + * @tparam math_t type of elements to be cached + * @tparam associativity number of vectors in a cache set + * + * @param stream cuda stream + * @param n_vec number of elements in a single vector that is stored in a + * cache entry + * @param cache_size in MiB + */ + Cache(cudaStream_t stream, int n_vec, float cache_size = 200) + : n_vec(n_vec), + cache_size(cache_size), + cache(0, stream), + cached_keys(0, stream), + cache_time(0, stream), + is_cached(0, stream), + ws_tmp(0, stream), + idx_tmp(0, stream), + d_num_selected_out(stream), + d_temp_storage(0, stream) + { + ASSERT(n_vec > 0, "Parameter n_vec: shall be larger than zero"); + ASSERT(associativity > 0, "Associativity shall be larger than zero"); + ASSERT(cache_size >= 0, "Cache size should not be negative"); + + // Calculate how many vectors would fit the cache + int n_cache_vecs = (cache_size * 1024 * 1024) / (sizeof(math_t) * n_vec); + + // The available memory shall be enough for at least one cache set + if (n_cache_vecs >= associativity) { + n_cache_sets = n_cache_vecs / associativity; + n_cache_vecs = n_cache_sets * associativity; + cache.resize(n_cache_vecs * n_vec, stream); + cached_keys.resize(n_cache_vecs, stream); + cache_time.resize(n_cache_vecs, stream); + RAFT_CUDA_TRY( + cudaMemsetAsync(cached_keys.data(), 0, cached_keys.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(cache_time.data(), 0, cache_time.size() * sizeof(int), stream)); + } else { + if (cache_size > 0) { + RAFT_LOG_WARN( + "Warning: not enough memory to cache a single set of " + "rows, not using cache"); + } + n_cache_sets = 0; + cache_size = 0; + } + RAFT_LOG_DEBUG( + "Creating cache with size=%f MiB, to store %d vectors, in " + "%d sets with associativity=%d", + cache_size, + n_cache_vecs, + n_cache_sets, + associativity); + } + + Cache(const Cache& other) = delete; + + Cache& operator=(const Cache& other) = delete; + + /** @brief Collect cached data into contiguous memory space. + * + * On exit, the tile array is filled the following way: + * out[i + n_vec*k] = cache[i + n_vec * idx[k]]), where i=0..n_vec-1, + * k = 0..n-1 + * + * Idx values less than 0 are ignored. + * + * @param [in] idx cache indices, size [n] + * @param [in] n the number of vectors that need to be collected + * @param [out] out vectors collected from cache, size [n_vec*n] + * @param [in] stream cuda stream + */ + void GetVecs(const int* idx, int n, math_t* out, cudaStream_t stream) + { + if (n > 0) { + get_vecs<<>>(cache.data(), n_vec, idx, n, out); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + } + + /** @brief Store vectors of data into the cache. + * + * Roughly the opposite of GetVecs, but the input vectors can be scattered + * in memory. The cache is updated using the following formula: + * + * cache[i + cache_idx[k]*n_vec] = tile[i + tile_idx[k]*n_vec], + * for i=0..n_vec-1, k=0..n-1 + * + * If tile_idx==nullptr, then we assume tile_idx[k] = k. + * + * Elements within a vector should be contiguous in memory (i.e. column vectors + * for column major data storage, or row vectors of row major data). 
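 *
 * For example, with n_vec = 3, n = 1, cache_idx = {5} and tile_idx == nullptr,
 * the call copies tile[0..2] into cache[15..17].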
+ * + * @param [in] tile stores the data to be cashed cached, size [n_vec x n_tile] + * @param [in] n_tile number of vectors in tile (at least n) + * @param [in] n number of vectors that need to be stored in the cache (a subset + * of all the vectors in the tile) + * @param [in] cache_idx cache indices for storing the vectors (negative values + * are ignored), size [n] + * @param [in] stream cuda stream + * @param [in] tile_idx indices of vectors that need to be stored + */ + void StoreVecs(const math_t* tile, + int n_tile, + int n, + int* cache_idx, + cudaStream_t stream, + const int* tile_idx = nullptr) + { + if (n > 0) { + store_vecs<<>>( + tile, n_tile, n_vec, tile_idx, n, cache_idx, cache.data(), cache.size() / n_vec); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + } + + /** @brief Map a set of keys to cache indices. + * + * For each k in 0..n-1, if keys[k] is found in the cache, then cache_idx[k] + * will tell the corresponding cache idx, and is_cached[k] is set to true. + * + * If keys[k] is not found in the cache, then is_cached[k] is set to false. + * In this case we assign the cache set for keys[k], and cache_idx[k] will + * store the cache set. + * + * @note in order to retrieve the cached vector j=cache_idx[k] from the cache, + * we have to access cache[i + j*n_vec], where i=0..n_vec-1. + * + * @note: do not use simultaneous GetCacheIdx and AssignCacheIdx + * + * @param [in] keys device array of keys, size [n] + * @param [in] n number of keys + * @param [out] cache_idx device array of cache indices corresponding to the + * input keys, size [n] + * @param [out] is_cached whether the element is already available in the + * cache, size [n] + * @param [in] stream + */ + void GetCacheIdx(int* keys, int n, int* cache_idx, bool* is_cached, cudaStream_t stream) + { + n_iter++; // we increase the iteration counter, that is used to time stamp + // accessing entries from the cache + get_cache_idx<<>>(keys, + n, + cached_keys.data(), + n_cache_sets, + associativity, + cache_time.data(), + cache_idx, + is_cached, + n_iter); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + /** @brief Map a set of keys to cache indices. + * + * Same as GetCacheIdx, but partitions the keys, and cache_idx arrays in a way + * that keys[0..n_cached-1] and cache_idx[0..n_cached-1] store the indices of + * vectors that are found in the cache, while keys[n_cached..n-1] are the + * indices of vectors that are not found in the cache. For the vectors not + * found in the cache, cache_idx[n_cached..n-1] stores the cache set, and this + * can be used to call AssignCacheIdx. 
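 *
 * For example, if keys = {3, 8, 5, 1} and only vectors 8 and 1 are currently
 * cached, then on exit n_cached = 2, keys is reordered so that 8 and 1 come
 * first, cache_idx[0..1] hold their cache locations, and cache_idx[2..3] hold
 * the cache sets assigned to the remaining keys 3 and 5.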
+ * + * @param [inout] keys device array of keys, size [n] + * @param [in] n number of indices + * @param [out] cache_idx device array of cache indices corresponding to + * the input keys, size [n] + * @param [out] n_cached number of elements that are cached + * @param [in] stream cuda stream + */ + void GetCacheIdxPartitioned(int* keys, int n, int* cache_idx, int* n_cached, cudaStream_t stream) + { + ResizeTmpBuffers(n, stream); + + GetCacheIdx(keys, n, ws_tmp.data(), is_cached.data(), stream); + + // Group cache indices as [already cached, non_cached] + cub::DevicePartition::Flagged(d_temp_storage.data(), + d_temp_storage_size, + ws_tmp.data(), + is_cached.data(), + cache_idx, + d_num_selected_out.data(), + n, + stream); + + raft::update_host(n_cached, d_num_selected_out.data(), 1, stream); + + // Similarily re-group the input indices + raft::copy(ws_tmp.data(), keys, n, stream); + cub::DevicePartition::Flagged(d_temp_storage.data(), + d_temp_storage_size, + ws_tmp.data(), + is_cached.data(), + keys, + d_num_selected_out.data(), + n, + stream); + + raft::interruptible::synchronize(stream); + } + + /** + * @brief Assign cache location to a set of keys. + * + * Note: call GetCacheIdx first, to get the cache_set assigned to the keys. + * Keys that cannot be cached are assigned to -1. + * + * @param [inout] keys device array of keys, size [n] + * @param [in] n number of elements that we want to cache + * @param [inout] cidx on entry: cache_set, on exit: assigned cache_idx or -1, + * size[n] + * @param [in] stream cuda stream + */ + void AssignCacheIdx(int* keys, int n, int* cidx, cudaStream_t stream) + { + if (n <= 0) return; + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + d_temp_storage_size, + cidx, + ws_tmp.data(), + keys, + idx_tmp.data(), + n, + 0, + sizeof(int) * 8, + stream); + + raft::copy(keys, idx_tmp.data(), n, stream); + + // set it to -1 + RAFT_CUDA_TRY(cudaMemsetAsync(cidx, 255, n * sizeof(int), stream)); + const int nthreads = associativity <= 32 ? associativity : 32; + + assign_cache_idx<<>>( + keys, n, ws_tmp.data(), cached_keys.data(), n_cache_sets, cache_time.data(), n_iter, cidx); + + RAFT_CUDA_TRY(cudaPeekAtLastError()); + if (debug_mode) RAFT_CUDA_TRY(cudaDeviceSynchronize()); + } + + /** Return approximate cache size in MiB. */ + float GetSizeInMiB() const { return cache_size; } + + /** + * Returns the number of vectors that can be cached. 
+ */ + int GetSize() const { return cached_keys.size(); } + + private: + int n_vec; //!< Number of elements in a cached vector + float cache_size; //!< in MiB + int n_cache_sets; //!< number of cache sets + + const int TPB = 256; //!< threads per block for kernel launch + int n_iter = 0; //!< Counter for time stamping cache operation + + bool debug_mode = false; + + rmm::device_uvector cache; //!< The value of cached vectors + rmm::device_uvector cached_keys; //!< Keys stored at each cache loc + rmm::device_uvector cache_time; //!< Time stamp for LRU cache + + // Helper arrays for GetCacheIdx + rmm::device_uvector is_cached; + rmm::device_uvector ws_tmp; + rmm::device_uvector idx_tmp; + + // Helper arrays for cub + rmm::device_scalar d_num_selected_out; + rmm::device_uvector d_temp_storage; + size_t d_temp_storage_size = 0; + + void ResizeTmpBuffers(int n, cudaStream_t stream) + { + if (ws_tmp.size() < static_cast(n)) { + ws_tmp.resize(n, stream); + is_cached.resize(n, stream); + idx_tmp.resize(n, stream); + cub::DevicePartition::Flagged(NULL, + d_temp_storage_size, + cached_keys.data(), + is_cached.data(), + cached_keys.data(), + d_num_selected_out.data(), + n, + stream); + d_temp_storage.resize(d_temp_storage_size, stream); + } + } +}; +}; // namespace raft::cache diff --git a/cpp/include/raft/util/fast_int_div.cuh b/cpp/include/raft/util/fast_int_div.cuh new file mode 100644 index 0000000000..a0cb8f0f53 --- /dev/null +++ b/cpp/include/raft/util/fast_int_div.cuh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace raft::util { + +/** + * @brief Perform fast integer division and modulo using a known divisor + * From Hacker's Delight, Second Edition, Chapter 10 + * + * @note This currently only supports 32b signed integers + * @todo Extend support for signed divisors + */ +struct FastIntDiv { + /** + * @defgroup HostMethods Ctor's that are accessible only from host + * @{ + * @brief Host-only ctor's + * @param _d the divisor + */ + FastIntDiv(int _d) : d(_d) { computeScalars(); } + FastIntDiv& operator=(int _d) + { + d = _d; + computeScalars(); + return *this; + } + /** @} */ + + /** + * @defgroup DeviceMethods Ctor's which even the device-side can access + * @{ + * @brief host and device ctor's + * @param other source object to be copied from + */ + HDI FastIntDiv(const FastIntDiv& other) : d(other.d), m(other.m), p(other.p) {} + HDI FastIntDiv& operator=(const FastIntDiv& other) + { + d = other.d; + m = other.m; + p = other.p; + return *this; + } + /** @} */ + + /** divisor */ + int d; + /** the term 'm' as found in the reference chapter */ + unsigned m; + /** the term 'p' as found in the reference chapter */ + int p; + + private: + void computeScalars() + { + if (d == 1) { + m = 0; + p = 1; + return; + } else if (d < 0) { + ASSERT(false, "FastIntDiv: division by negative numbers not supported!"); + } else if (d == 0) { + ASSERT(false, "FastIntDiv: got division by zero!"); + } + int64_t nc = ((1LL << 31) / d) * d - 1; + p = 31; + int64_t twoP, rhs; + do { + ++p; + twoP = 1LL << p; + rhs = nc * (d - twoP % d); + } while (twoP <= rhs); + m = (twoP + d - twoP % d) / d; + } +}; // struct FastIntDiv + +/** + * @brief Division overload, so that FastIntDiv can be transparently switched + * to even on device + * @param n numerator + * @param divisor the denominator + * @return the quotient + */ +HDI int operator/(int n, const FastIntDiv& divisor) +{ + if (divisor.d == 1) return n; + int ret = (int64_t(divisor.m) * int64_t(n)) >> divisor.p; + if (n < 0) ++ret; + return ret; +} + +/** + * @brief Modulo overload, so that FastIntDiv can be transparently switched + * to even on device + * @param n numerator + * @param divisor the denominator + * @return the remainder + */ +HDI int operator%(int n, const FastIntDiv& divisor) +{ + int quotient = n / divisor; + int remainder = n - quotient * divisor.d; + return remainder; +} + +}; // namespace raft::util diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu new file mode 100644 index 0000000000..c893e9a358 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu new file mode 100644 index 0000000000..3265f828e6 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu new file mode 100644 index 0000000000..0edf45a6f1 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu new file mode 100644 index 0000000000..a719175e6b --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu new file mode 100644 index 0000000000..6577e1b6c7 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu new file mode 100644 index 0000000000..1d2582cf81 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu new file mode 100644 index 0000000000..13d5159504 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::TanhKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
new file mode 100644
index 0000000000..ee62de7d34
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::TanhKernel<float>;
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index b10de9d1cc..07ec85bf1e 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -116,6 +116,7 @@ if(BUILD_TESTS)
     test/distance/dist_minkowski.cu
     test/distance/dist_russell_rao.cu
     test/distance/fused_l2_nn.cu
+    test/distance/gram.cu
     OPTIONAL
     DIST
   )
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
new file mode 100644
index 0000000000..cf7215bddb
--- /dev/null
+++ b/cpp/test/distance/gram.cu
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <raft/core/handle.hpp>
+#include <raft/distance/kernels.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::distance::kernels {
+
+// Get the offset of element [i,k].
+HDI int get_offset(int i, int k, int ld, bool is_row_major)
+{
+  return is_row_major ? i * ld + k : i + k * ld;
+}
+
+struct GramMatrixInputs {
+  int n1;      // feature vectors in matrix 1
+  int n2;      // feature vectors in matrix 2
+  int n_cols;  // number of elements in a feature vector
+  bool is_row_major;
+  KernelParams kernel;
+  int ld1;
+  int ld2;
+  int ld_out;
+  // We will generate random input using the dimensions given here.
+  // The reference output is calculated by a custom kernel.
+};
+
+std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p)
+{
+  std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
+  os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/"
+     << (p.is_row_major ? "RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_"
+     << p.ld1 << "x" << p.ld2 << "x" << p.ld_out;
+  return os;
+}
+
+const std::vector<GramMatrixInputs> inputs = {
+  {42, 137, 2, false, {KernelType::LINEAR}},
+  {42, 137, 2, true, {KernelType::LINEAR}},
+  {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181},
+  {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143},
+  {3, 4, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, true, {KernelType::RBF, 0, 0.5}},
+  // Distance kernel does not support LD parameter yet.
+  //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49},
+  // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143},
+};
+
+template <typename math_t>
+class GramMatrixTest : public ::testing::TestWithParam<GramMatrixInputs> {
+ protected:
+  GramMatrixTest()
+    : params(GetParam()), stream(0), x1(0, stream), x2(0, stream), gram(0, stream), gram_host(0)
+  {
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+
+    if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; }
+    if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; }
+    if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; }
+    // Derive the size of the output from the offset of the last element.
+    size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1;
+    x1.resize(size, stream);
+    size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1;
+    x2.resize(size, stream);
+    size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1;
+
+    gram.resize(size, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream));
+    gram_host.resize(gram.size());
+    std::fill(gram_host.begin(), gram_host.end(), 0);
+
+    raft::random::Rng r(42137ULL);
+    r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream);
+    r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream);
+  }
+
+  ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); }
+
+  // Calculate the Gram matrix on the host.
+  void naiveKernel()
+  {
+    std::vector<math_t> x1_host(x1.size());
+    raft::update_host(x1_host.data(), x1.data(), x1.size(), stream);
+    std::vector<math_t> x2_host(x2.size());
+    raft::update_host(x2_host.data(), x2.data(), x2.size(), stream);
+    handle.sync_stream(stream);
+
+    for (int i = 0; i < params.n1; i++) {
+      for (int j = 0; j < params.n2; j++) {
+        float d = 0;
+        for (int k = 0; k < params.n_cols; k++) {
+          if (params.kernel.kernel == KernelType::RBF) {
+            math_t diff = x1_host[get_offset(i, k, params.ld1, params.is_row_major)] -
+                          x2_host[get_offset(j, k, params.ld2, params.is_row_major)];
+            d += diff * diff;
+          } else {
+            d += x1_host[get_offset(i, k, params.ld1, params.is_row_major)] *
+                 x2_host[get_offset(j, k, params.ld2, params.is_row_major)];
+          }
+        }
+        int idx  = get_offset(i, j, params.ld_out, params.is_row_major);
+        math_t v = 0;
+        switch (params.kernel.kernel) {
+          case (KernelType::LINEAR): gram_host[idx] = d; break;
+          case (KernelType::POLYNOMIAL):
+            v              = params.kernel.gamma * d + params.kernel.coef0;
+            gram_host[idx] = std::pow(v, params.kernel.degree);
+            break;
+          case (KernelType::TANH):
+            gram_host[idx] = std::tanh(params.kernel.gamma * d + params.kernel.coef0);
+            break;
+          case (KernelType::RBF): gram_host[idx] = exp(-params.kernel.gamma * d); break;
+        }
+      }
+    }
+  }
+
+  void runTest()
+  {
+    std::unique_ptr<GramMatrixBase<math_t>> kernel = std::unique_ptr<GramMatrixBase<math_t>>(
+      KernelFactory<math_t>::create(params.kernel, handle.get_cublas_handle()));
+
+    kernel->evaluate(x1.data(),
+                     params.n1,
+                     params.n_cols,
+                     x2.data(),
+                     params.n2,
+                     gram.data(),
+                     params.is_row_major,
+                     stream,
+                     params.ld1,
+                     params.ld2,
+                     params.ld_out);
+    naiveKernel();
+    ASSERT_TRUE(raft::devArrMatchHost(
+      gram_host.data(), gram.data(), gram.size(), raft::CompareApprox<math_t>(1e-6f)));
+  }
+
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
+  GramMatrixInputs params;
+
+  rmm::device_uvector<math_t> x1;
+  rmm::device_uvector<math_t> x2;
+  rmm::device_uvector<math_t> gram;
+  std::vector<math_t> gram_host;
+};
+
+typedef GramMatrixTest<float> GramMatrixTestFloat;
+typedef GramMatrixTest<double> GramMatrixTestDouble;
+
+TEST_P(GramMatrixTestFloat, Gram) { runTest(); }
+
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs));
+};  // end namespace raft::distance::kernels
\ No newline at end of file
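
For reference, the sketch below (not part of the patch) shows how the relocated API exercised by runTest() above might be called from user code. It assumes the new public header raft/distance/kernels.cuh exposes KernelParams, KernelType, KernelFactory, and GramMatrixBase under raft::distance::kernels, as the PR title and the test namespace suggest; the matrix sizes, seeds, and random-fill step are purely illustrative.

// Sketch: computing a polynomial Gram matrix K(x, y) = (gamma * <x, y> + coef0)^degree
// with the relocated raft::distance::kernels API. Namespace qualification of the
// kernel types is an assumption based on this PR; the evaluate() argument order
// mirrors the call in runTest() above.
#include <raft/core/handle.hpp>
#include <raft/distance/kernels.cuh>
#include <raft/random/rng.cuh>
#include <rmm/device_uvector.hpp>

#include <memory>

int main()
{
  raft::handle_t handle;
  cudaStream_t stream = handle.get_stream();

  // Illustrative sizes: 128 and 64 row-major feature vectors of length 16.
  const int n1 = 128, n2 = 64, n_cols = 16;
  rmm::device_uvector<float> x1(n1 * n_cols, stream);
  rmm::device_uvector<float> x2(n2 * n_cols, stream);
  rmm::device_uvector<float> gram(n1 * n2, stream);

  raft::random::Rng rng(1234ULL);
  rng.uniform(x1.data(), x1.size(), 0.0f, 1.0f, stream);
  rng.uniform(x2.data(), x2.size(), 0.0f, 1.0f, stream);

  // degree = 2, gamma = 0.5, coef0 = 2.4, mirroring the polynomial test inputs above.
  raft::distance::kernels::KernelParams params{
    raft::distance::kernels::KernelType::POLYNOMIAL, 2, 0.5, 2.4};

  std::unique_ptr<raft::distance::kernels::GramMatrixBase<float>> kernel(
    raft::distance::kernels::KernelFactory<float>::create(params, handle.get_cublas_handle()));

  // Inputs, sizes, output, row-major flag, stream, then the three leading dimensions
  // (compact row-major layout: ld1 = ld2 = n_cols, ld_out = n2).
  kernel->evaluate(
    x1.data(), n1, n_cols, x2.data(), n2, gram.data(), true, stream, n_cols, n_cols, n2);

  handle.sync_stream(stream);
  return 0;
}

As in the test inputs above, the RBF kernel currently routes through the distance primitives and does not yet accept custom leading dimensions, so compact layouts should be used for that kernel type.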