From cb77979f1a1d296d5d291c734eb57749cb9a7483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20F=C3=B6rster?= <97973773+mfoerste4@users.noreply.github.com> Date: Thu, 22 Jun 2023 18:44:55 +0200 Subject: [PATCH] Fix launchconfig y-gridsize too large in epilogue kernel (#1586) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the launch config for epilogue kernels in the GramMatrix computation for large number of columns in the resulting kernel matrix. This code-path is triggered when predicting scores with a support vector space larger than 262140 which would result in a y-grid-dimension larger than 65535. Although this is not a regression this code path might get hit more often now that we allow sparse input data, so we might want to fix it in 23.06 as well. CC @tfeher . Authors: - Malte Förster (https://github.com/mfoerste4) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1586 --- .../detail/kernels/kernel_matrices.cuh | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 234265dbc1..f02e29c797 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -135,6 +135,17 @@ __global__ void rbf_kernel_expanded( } } +namespace { +std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) +{ + dim3 block_shape = dim3(32, 4); + const int num_blocks_x = raft::ceildiv(n1, 32); + const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); + dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); + return std::make_tuple(grid_shape, block_shape); +} +} // namespace + /** * Create a kernel matrix using polynomial kernel function. */ @@ -152,12 +163,11 @@ class PolynomialKernel : public GramMatrixBase { polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( inout, rows * cols, exponent, gain, offset); } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - polynomial_kernel<<>>(inout, ld, n1, n2, exponent, gain, offset); + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + polynomial_kernel<<>>( + inout, ld, n1, n2, exponent, gain, offset); } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -327,12 +337,10 @@ class TanhKernel : public GramMatrixBase { tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( inout, rows * cols, gain, offset); } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -498,14 +506,13 @@ class RBFKernel : public GramMatrixBase { bool is_row_major, cudaStream_t stream) { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; - rbf_kernel_expanded<<>>(inout, ld, n1, n2, norm_n1, norm_n2, gain); + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; + math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + rbf_kernel_expanded<<>>( + inout, ld, n1, n2, norm_n1, norm_n2, gain); } public: @@ -576,7 +583,6 @@ class RBFKernel : public GramMatrixBase { math_t* norm_x2) { cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given rmm::device_uvector tmp_norm_x1(0, stream); rmm::device_uvector tmp_norm_x2(0, stream);