[FEA] Add support for select_k on CSR matrix

- This PR is one part of the feature of #1969 - Add the API of 'select_k' accepting CSR as input - Add the API of 'segmented_copy' Authors: - James Rong (https://github.com/rhdong) Approvers: - Ben Frederickson (https://github.com/benfred) - Micka (https://github.com/lowener) - Corey J. Nolet (https://github.com/cjnolet)
rapidsai · Jan 31, 2024 · 5071971 · 5071971
1 parent d4ae271
commit 5071971
Show file tree

Hide file tree

Showing 17 changed files with 1,237 additions and 3 deletions.
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
@@ -128,6 +128,7 @@ if(BUILD_PRIMS_BENCH)
     bench/prims/matrix/argmin.cu
     bench/prims/matrix/gather.cu
     bench/prims/matrix/select_k.cu
+    bench/prims/matrix/select_k_csr.cu
     bench/prims/matrix/main.cpp
     OPTIONAL
     LIB

diff --git a/cpp/bench/prims/matrix/select_k_csr.cu b/cpp/bench/prims/matrix/select_k_csr.cu
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <common/benchmark.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <rmm/device_uvector.hpp>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/util/itertools.hpp>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+
+#include <random>
+#include <sstream>
+#include <unordered_set>
+#include <vector>
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/matrix/select_k.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::bench::sparse {
+
+template <typename index_t>
+struct bench_param {
+  index_t n_rows;
+  index_t n_cols;
+  index_t top_k;
+  float sparsity;
+  bool select_min         = true;
+  bool customized_indices = false;
+};
+
+template <typename index_t>
+inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
+{
+  os << " rows*cols=" << params.n_rows << "*" << params.n_cols << "\ttop_k=" << params.top_k
+     << "\tsparsity=" << params.sparsity;
+  return os;
+}
+
+template <typename value_t, typename index_t>
+struct SelectKCsrTest : public fixture {
+  SelectKCsrTest(const bench_param<index_t>& p)
+    : fixture(true),
+      params(p),
+      handle(stream),
+      values_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      customized_indices_d(0, stream),
+      dst_values_d(0, stream),
+      dst_indices_d(0, stream)
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols, false);
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> customized_indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    std::vector<value_t> dst_values_h(params.n_rows * params.top_k, static_cast<value_t>(2.0f));
+    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k,
+                                       static_cast<index_t>(params.n_rows * params.n_cols * 100));
+
+    dst_values_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+    values_d.resize(nnz, stream);
+
+    if (nnz) {
+      auto blobs_values = raft::make_device_matrix<value_t, index_t>(handle, 1, nnz);
+      auto labels       = raft::make_device_vector<index_t, index_t>(handle, 1);
+
+      raft::random::make_blobs<value_t, index_t>(blobs_values.data_handle(),
+                                                 labels.data_handle(),
+                                                 1,
+                                                 nnz,
+                                                 1,
+                                                 stream,
+                                                 false,
+                                                 nullptr,
+                                                 nullptr,
+                                                 value_t(1.0),
+                                                 false,
+                                                 value_t(-10.0f),
+                                                 value_t(10.0f),
+                                                 uint64_t(2024));
+      raft::copy(values_d.data(), blobs_values.data_handle(), nnz, stream);
+      resource::sync_stream(handle);
+    }
+
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+
+    if (params.customized_indices) {
+      customized_indices_d.resize(nnz, stream);
+      update_device(customized_indices_d.data(),
+                    customized_indices_h.data(),
+                    customized_indices_h.size(),
+                    stream);
+    }
+  }
+
+  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
+  {
+    index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    index_t res            = num_ones;
+
+    for (index_t i = 0; i < total_elements; ++i) {
+      matrix[i] = false;
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis_idx(0, total_elements - 1);
+
+    while (num_ones > 0) {
+      size_t index = dis_idx(gen);
+      if (matrix[index] == false) {
+        matrix[index] = true;
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
+    }
+  }
+
+  template <typename data_t>
+  std::optional<data_t> get_opt_var(data_t x)
+  {
+    if (params.customized_indices) {
+      return x;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    auto in_val_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(),
+      indices_d.data(),
+      params.n_rows,
+      params.n_cols,
+      static_cast<index_t>(indices_d.size()));
+
+    auto in_val =
+      raft::make_device_csr_matrix_view<const value_t>(values_d.data(), in_val_structure);
+
+    std::optional<raft::device_vector_view<const index_t, index_t>> in_idx;
+
+    in_idx = get_opt_var(
+      raft::make_device_vector_view<const index_t, index_t>(customized_indices_d.data(), nnz));
+
+    auto out_val = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+    resource::sync_stream(handle);
+    loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() {
+      raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+      resource::sync_stream(handle);
+    });
+  }
+
+ protected:
+  const raft::device_resources handle;
+
+  bench_param<index_t> params;
+  index_t nnz;
+
+  rmm::device_uvector<value_t> values_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<index_t> customized_indices_d;
+
+  rmm::device_uvector<value_t> dst_values_d;
+  rmm::device_uvector<index_t> dst_indices_d;
+};  // struct SelectKCsrTest
+
+template <typename index_t>
+const std::vector<bench_param<index_t>> getInputs()
+{
+  std::vector<bench_param<index_t>> param_vec;
+  struct TestParams {
+    index_t m;
+    index_t n;
+    index_t k;
+    float sparsity;
+  };
+
+  const std::vector<TestParams> params_group =
+    raft::util::itertools::product<TestParams>({index_t(10), index_t(1024)},
+                                               {index_t(1024 * 10), index_t(1024 * 1024)},
+                                               {index_t(128), index_t(100), index_t(2048)},
+                                               {0.1f, 0.2f, 0.5f});
+
+  param_vec.reserve(params_group.size());
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, params.sparsity}));
+  }
+  return param_vec;
+}
+
+RAFT_BENCH_REGISTER((SelectKCsrTest<float, int>), "", getInputs<int>());
+
+}  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -122,6 +122,41 @@ void trunc_zero_origin(raft::resources const& handle,
                                       resource::get_cuda_stream(handle));
 }
 
+/**
+ * @brief Copy a specific number of elements row by row from the source vector to the target matrix
+ * according to the segment indicated by offsets
+ *
+ * @tparam m_t the type of the copied items.
+ * @tparam idx_t the index type of vectors and matrix.
+ * @param[in] handle raft handle
+ * @param[in] max_len_per_row Maximum number of copies per row
+ * @param[in] src Source vector
+ * @param[in] offsets Indicates the starting and ending index of each row in the vector
+ * @param[out] dst Destination matrix in row major order
+ *
+ * @note When the length of one segment is less than max_len_per_row, the remaining position values
+ * of dst will remain unchanged.
+ */
+template <typename m_t, typename idx_t>
+void segmented_copy(raft::resources const& handle,
+                    idx_t max_len_per_row,
+                    raft::device_vector_view<m_t, idx_t> src,
+                    raft::device_vector_view<idx_t, idx_t> offsets,
+                    raft::device_matrix_view<m_t, idx_t, row_major> dst)
+{
+  RAFT_EXPECTS(static_cast<idx_t>(offsets.size()) == (dst.extent(0) + 1),
+               "Number of offsets must be larger than number of output rows by 1");
+  RAFT_EXPECTS(dst.extent(1) >= max_len_per_row,
+               "Number of rows in the out must be equal or larger than max_len_per_row");
+  detail::segmented_copy(handle,
+                         src.data_handle(),
+                         dst.extent(0),
+                         dst.extent(1),
+                         max_len_per_row,
+                         offsets.data_handle(),
+                         dst.data_handle());
+}
+
 /** @} */  // end of group matrix_copy
 
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -316,6 +316,71 @@ m_t getL2Norm(raft::resources const& handle, const m_t* in, idx_t size, cudaStre
   return normval;
 }
 
+// Threads per block in segmented_copy_kernel.
+static const constexpr int SEGMENTED_COPY_TPB_256 = 256;
+static const constexpr int SEGMENTED_COPY_TPB_32  = 32;
+
+template <typename m_t, typename idx_t, idx_t TPB>
+RAFT_KERNEL __launch_bounds__(TPB) segmented_copy_kernel(
+  const m_t* src, idx_t n_rows, idx_t n_cols, idx_t max_len_per_row, idx_t* offsets, m_t* dst)
+{
+#pragma unroll
+  for (idx_t row_id = blockIdx.y; row_id < n_rows; row_id += gridDim.y) {
+    idx_t segment_start = offsets[row_id];
+    idx_t len           = min(offsets[row_id + 1] - segment_start, max_len_per_row);
+    for (idx_t col_id = threadIdx.x + blockIdx.x * blockDim.x; col_id < len;
+         col_id += blockDim.x * gridDim.x) {
+      dst[row_id * n_cols + col_id] = src[segment_start + col_id];
+    }
+  }
+}
+
+template <typename m_t, typename idx_t>
+void segmented_copy(raft::resources const& handle,
+                    const m_t* src,
+                    idx_t n_rows,
+                    idx_t n_cols,
+                    idx_t max_len_per_row,
+                    idx_t* offsets,
+                    m_t* dst)
+{
+  auto stream = resource::get_cuda_stream(handle);
+
+  idx_t tpb = max_len_per_row >= 256 ? SEGMENTED_COPY_TPB_256 : SEGMENTED_COPY_TPB_32;
+
+  int dev_id, sm_count, blocks_per_sm;
+  cudaGetDevice(&dev_id);
+  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
+
+  if (tpb == SEGMENTED_COPY_TPB_32) {
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_32>, tpb, 0);
+  } else if (tpb == SEGMENTED_COPY_TPB_256) {
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_256>, tpb, 0);
+  }
+
+  idx_t max_active_blocks = sm_count * blocks_per_sm;
+  // `max threads number = sm_count * blocks_per_sm * tpb`
+  // `problem size = n_rows * max_len_per_row`
+  idx_t required_active_blocks =
+    raft::min(max_active_blocks, raft::ceildiv(n_rows * max_len_per_row, tpb));
+
+  idx_t blocks_per_row = raft::ceildiv(required_active_blocks, n_rows);
+  idx_t grid_rows      = raft::ceildiv(required_active_blocks, blocks_per_row);
+  dim3 block(tpb, 1);
+  dim3 grid(blocks_per_row, grid_rows);
+
+  if (tpb == SEGMENTED_COPY_TPB_32) {
+    segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_32>
+      <<<grid, block, 0, stream>>>(src, n_rows, n_cols, max_len_per_row, offsets, dst);
+  } else if (tpb == SEGMENTED_COPY_TPB_256) {
+    segmented_copy_kernel<m_t, idx_t, SEGMENTED_COPY_TPB_256>
+      <<<grid, block, 0, stream>>>(src, n_rows, n_cols, max_len_per_row, offsets, dst);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 }  // end namespace detail
 }  // end namespace matrix
 }  // end namespace raft