From 5f2bd19276d4314556795c86fa2230ccac988583 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 28 Mar 2024 12:33:39 -0500
Subject: [PATCH 1/4] Use `conda env create --yes` instead of `--force`.
 (#2247)

---
 ci/build_docs.sh  | 2 +-
 ci/check_style.sh | 4 ++--
 ci/test_cpp.sh    | 2 +-
 ci/test_python.sh | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 3d72c815db..9605b52f8b 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -11,7 +11,7 @@ rapids-dependency-file-generator \
   --file_key docs \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
 
-rapids-mamba-retry env create --force -f env.yaml -n docs
+rapids-mamba-retry env create --yes -f env.yaml -n docs
 conda activate docs
 
 rapids-print-env
diff --git a/ci/check_style.sh b/ci/check_style.sh
index 0ee6e88e58..d7baa88e8f 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -11,7 +11,7 @@ rapids-dependency-file-generator \
   --file_key checks \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
 
-rapids-mamba-retry env create --force -f env.yaml -n checks
+rapids-mamba-retry env create --yes -f env.yaml -n checks
 conda activate checks
 
 # Run pre-commit checks
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index fb2d025f9c..f83ddf616d 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -14,7 +14,7 @@ rapids-dependency-file-generator \
   --file_key test_cpp \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml
 
-rapids-mamba-retry env create --force -f env.yaml -n test
+rapids-mamba-retry env create --yes -f env.yaml -n test
 
 # Temporarily allow unbound variables for conda activation.
 set +u
diff --git a/ci/test_python.sh b/ci/test_python.sh
index aae8ae03ea..f5b188ca0b 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -14,7 +14,7 @@ rapids-dependency-file-generator \
   --file_key test_python \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
 
-rapids-mamba-retry env create --force -f env.yaml -n test
+rapids-mamba-retry env create --yes -f env.yaml -n test
 
 # Temporarily allow unbound variables for conda activation.
 set +u

From eabe3b00dad4225b00cb93d16cf1918b213b0ae3 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Thu, 4 Apr 2024 02:11:48 +0900
Subject: [PATCH 2/4] Add CAGRA-Q subspace dim = 4 support (#2244)

This PR adds support for a subspace dimension (pq_len) of 4 in CAGRA-Q.

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2244
---
 .../neighbors/detail/cagra/cagra_search.cuh   |  3 +-
 .../detail/cagra/compute_distance_vpq.cuh     | 29 ++++++++++---------
 .../raft/neighbors/detail/vpq_dataset.cuh     |  2 +-
 cpp/test/neighbors/ann_cagra_vpq.cuh          |  4 +--
 4 files changed, 21 insertions(+), 17 deletions(-)
 mode change 100755 => 100644 cpp/test/neighbors/ann_cagra_vpq.cuh

diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
index d30f69ddcd..ccfe3c7e2d 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -166,7 +166,8 @@ void launch_vpq_search_main_core(
   CagraSampleFilterT sample_filter)
 {
   RAFT_EXPECTS(vpq_dset->pq_bits() == 8, "Only pq_bits = 8 is supported for now");
-  RAFT_EXPECTS(vpq_dset->pq_len() == 2, "Only pq_len 2 is supported for now");
+  RAFT_EXPECTS(vpq_dset->pq_len() == 2 || vpq_dset->pq_len() == 4,
+               "Only pq_len 2 or 4 is supported for now");
   RAFT_EXPECTS(vpq_dset->dim() % vpq_dset->pq_dim() == 0,
                "dim must be a multiple of pq_dim at the moment");
 
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
index 0204addba7..e73d24bfb6 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
@@ -33,6 +33,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
   using CODE_BOOK_T = CODE_BOOK_T_;
   using QUERY_T     = typename dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::QUERY_T;
 
+  static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
+
   const std::uint8_t* encoded_dataset_ptr;
   const std::uint32_t encoded_dataset_dim;
   const std::uint32_t n_subspace;
@@ -53,18 +55,19 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
     smem_pq_code_book_ptr = reinterpret_cast<CODE_BOOK_T*>(smem_ptr);
 
     // Copy PQ table
-    if constexpr (std::is_same<CODE_BOOK_T, half>::value) {
-      for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
-        half2 buf2;
-        buf2.x                                                   = pq_code_book_ptr[i];
-        buf2.y                                                   = pq_code_book_ptr[i + 1];
-        (reinterpret_cast<half2*>(smem_pq_code_book_ptr + i))[0] = buf2;
-      }
-    } else {
-      for (unsigned i = threadIdx.x; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x) {
-        // TODO: vectorize
-        smem_pq_code_book_ptr[i] = pq_code_book_ptr[i];
-      }
+    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
+      half2 buf2;
+      buf2.x = pq_code_book_ptr[i];
+      buf2.y = pq_code_book_ptr[i + 1];
+
+      // Change the order of PQ code book array to reduce the
+      // frequency of bank conflicts.
+      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
+      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
+      const auto j                          = i / num_elements_per_bank;
+      const auto smem_index =
+        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
+      reinterpret_cast<half2*>(smem_pq_code_book_ptr)[smem_index] = buf2;
     }
   }
 
@@ -136,7 +139,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
               4 + k));
           }
           //
-          if constexpr ((std::is_same<CODE_BOOK_T, half>::value) && (PQ_LEN % 2 == 0)) {
+          if constexpr (PQ_LEN % 2 == 0) {
             // **** Use half2 for distance computation ****
             half2 norm2{0, 0};
 #pragma unroll
diff --git a/cpp/include/raft/neighbors/detail/vpq_dataset.cuh b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
index f1321ba343..f6cd2a1ceb 100644
--- a/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
+++ b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
@@ -81,7 +81,7 @@ auto fill_missing_params_heuristics(const vpq_params& params, const DatasetT& da
   vpq_params r  = params;
   double n_rows = dataset.extent(0);
   size_t dim    = dataset.extent(1);
-  if (r.pq_dim == 0) { r.pq_dim = raft::div_rounding_up_safe(dim, size_t{2}); }
+  if (r.pq_dim == 0) { r.pq_dim = raft::div_rounding_up_safe(dim, size_t{4}); }
   if (r.pq_bits == 0) { r.pq_bits = 8; }
   if (r.vq_n_centers == 0) { r.vq_n_centers = raft::round_up_safe<uint32_t>(std::sqrt(n_rows), 8); }
   if (r.vq_kmeans_trainset_fraction == 0) {
diff --git a/cpp/test/neighbors/ann_cagra_vpq.cuh b/cpp/test/neighbors/ann_cagra_vpq.cuh
old mode 100755
new mode 100644
index 503b1a413a..6b24bca921
--- a/cpp/test/neighbors/ann_cagra_vpq.cuh
+++ b/cpp/test/neighbors/ann_cagra_vpq.cuh
@@ -158,7 +158,7 @@ class AnnCagraVpqTest : public ::testing::TestWithParam<AnnCagraVpqInputs> {
       resource::sync_stream(handle_);
     }
 
-    const auto vpq_k = ps.k * 16;
+    const auto vpq_k = ps.k * 4;
     {
       rmm::device_uvector<DistanceT> distances_dev(vpq_k * ps.n_queries, stream_);
       rmm::device_uvector<IdxT> indices_dev(vpq_k * ps.n_queries, stream_);
@@ -319,7 +319,7 @@ const std::vector<AnnCagraVpqInputs> vpq_inputs = raft::util::itertools::product
   {1000, 10000},                                      // n_rows
   {128, 132, 192, 256, 512, 768},                     // dim
   {8, 12},                                            // k
-  {2},                                                // pq_len
+  {2, 4},                                             // pq_len
   {8},                                                // pq_bits
   {graph_build_algo::NN_DESCENT},                     // build_algo
   {search_algo::SINGLE_CTA, search_algo::MULTI_CTA},  // algo

From 8a68518fd5a0ae0e750cd8f77b02f73efc111f5c Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Thu, 4 Apr 2024 19:26:04 +0200
Subject: [PATCH 3/4] Fix time computation in CAGRA notebook (#2231)

Closes #2230.
This also adds `build_algo="nn_descent"` to the CAGRA build parameters in the notebook.

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2231
---
 .../VectorSearch_QuestionRetrieval.ipynb      | 52 ++++++++++---------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/notebooks/VectorSearch_QuestionRetrieval.ipynb b/notebooks/VectorSearch_QuestionRetrieval.ipynb
index b3a15d3a08..33a2f60228 100644
--- a/notebooks/VectorSearch_QuestionRetrieval.ipynb
+++ b/notebooks/VectorSearch_QuestionRetrieval.ipynb
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "eb1e81c3",
    "metadata": {},
    "outputs": [
@@ -154,7 +154,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "ee4c5cc0",
    "metadata": {},
    "outputs": [
@@ -184,7 +184,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "0a1a6307",
    "metadata": {},
    "outputs": [
@@ -249,7 +249,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "ad90b4be",
    "metadata": {},
    "outputs": [
@@ -292,7 +292,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "724dcacb",
    "metadata": {
     "scrolled": true
@@ -320,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "c27d4715",
    "metadata": {},
    "outputs": [
@@ -347,7 +347,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "bc375518",
    "metadata": {},
    "outputs": [
@@ -373,7 +373,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "ab154181",
    "metadata": {},
    "outputs": [
@@ -399,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "2d6017ed",
    "metadata": {},
    "outputs": [
@@ -435,7 +435,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "f5cfb644",
    "metadata": {},
    "outputs": [
@@ -462,7 +462,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "b5694d00",
    "metadata": {},
    "outputs": [
@@ -489,7 +489,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "fcfc3c5b",
    "metadata": {},
    "outputs": [
@@ -528,7 +528,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "50df1f43-c580-4019-949a-06bdc7185536",
    "metadata": {},
    "outputs": [],
@@ -538,7 +538,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "091cde52-4652-4230-af2b-75c35357f833",
    "metadata": {},
    "outputs": [
@@ -546,21 +546,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 1min 23s, sys: 2min 7s, total: 3min 31s\n",
-      "Wall time: 4min 43s\n"
+      "CPU times: user 35.3 s, sys: 4.5 s, total: 39.8 s\n",
+      "Wall time: 2.16 s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
-    "params = cagra.IndexParams(intermediate_graph_degree=128, graph_degree=64)\n",
+    "params = cagra.IndexParams(intermediate_graph_degree=32, graph_degree=16, build_algo=\"nn_descent\")\n",
     "cagra_index = cagra.build(params, corpus_embeddings)\n",
-    "search_params = cagra.SearchParams()"
+    "search_params = cagra.SearchParams(algo=\"multi_cta\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "id": "df229e21-f6b6-4d6c-ad54-2724f8738934",
    "metadata": {},
    "outputs": [],
@@ -569,9 +569,12 @@
     "    # Encode the query using the bi-encoder and find potentially relevant passages\n",
     "    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n",
     "\n",
+    "    start_time = time.time()\n",
     "    hits = cagra.search(search_params, cagra_index, question_embedding[None], top_k)\n",
+    "    end_time = time.time()\n",
     "\n",
     "    # Output of top-k hits\n",
+    "    print(\"Results (after {:.3f} seconds):\".format(end_time - start_time))\n",
     "    print(\"Input question:\", query)\n",
     "    for k in range(top_k):\n",
     "        print(\"\\t{:.3f}\\t{}\".format(hits[0][0, k], passages[hits[1][0, k]]))"
@@ -587,19 +590,20 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 16 µs, sys: 25 µs, total: 41 µs\n",
-      "Wall time: 83.7 µs\n",
+      "Results (after 0.005 seconds):\n",
       "Input question: Who was Grace Hopper?\n",
       "\t181.649\t['Grace Hopper', 'Hopper was born in New York, USA. Hopper graduated from Vassar College in 1928 and Yale University in 1934 with a Ph.D degree in mathematics. She joined the US Navy during the World War II in 1943. She worked on computers in the Navy for 43 years. She then worked in other private industry companies after 1949. She retired from the Navy in 1986 and died on January 1, 1992.']\n",
       "\t192.946\t['Leona Helmsley', 'Leona Helmsley (July 4, 1920 – August 20, 2007) was an American businesswoman. She was known for having a flamboyant personality. She had a reputation for tyrannical behavior; she was nicknamed the Queen of Mean.']\n",
       "\t194.951\t['Grace Hopper', 'Grace Murray Hopper (December 9 1906 – January 1 1992) was an American computer scientist and United States Navy officer.']\n",
       "\t202.192\t['Nellie Bly', 'Elizabeth Cochrane Seaman (born Elizabeth Jane Cochran; May 5, 1864 – January 27, 1922), better known by her pen name Nellie Bly, was an American journalist, novelist and inventor. She was a newspaper reporter, who worked at various jobs for exposing poor working conditions. Nellie Bly, also, fought for women\\'s right and was known for investigative reporting. She best known for her record-breaking trip around the world in 72 days, inspired by the adventure novel \"Around the World in Eighty Days\" by Jules Verne. In the 1880s, she went undercover as a mentally ill patient in a psychiatric hospital for ten days, with the report being made public in a book called \"\"Ten Days in a Mad-House\"\". She was added to the National Women\\'s Hall of Fame in 1998.']\n",
-      "\t205.038\t['Abbie Hoffman', 'Abbot Howard \"Abbie\" Hoffman (November 30, 1936 – April 12, 1989) was an American social and political activist.']\n"
+      "\t205.038\t['Abbie Hoffman', 'Abbot Howard \"Abbie\" Hoffman (November 30, 1936 – April 12, 1989) was an American social and political activist.']\n",
+      "CPU times: user 4.18 ms, sys: 3.88 ms, total: 8.07 ms\n",
+      "Wall time: 9.97 ms\n"
      ]
     }
    ],
    "source": [
-    "%time \n",
+    "%%time \n",
     "search_raft_cagra(query=\"Who was Grace Hopper?\")"
    ]
   }
@@ -620,7 +624,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,

From 4a20d03af7f6181e3083bc3b65522d7f2c3b6218 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Mon, 8 Apr 2024 09:36:00 -0700
Subject: [PATCH 4/4] [FEA] Add support for `select_k` on CSR matrix (#2140)

- This PR implements one part of the feature in #1969.
- Adds a `select_k` API that accepts a CSR matrix as input.
Authors:
  - James Rong (https://github.com/rhdong)

Approvers:
  - Ben Frederickson (https://github.com/benfred)
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Authors:
  - rhdong (https://github.com/rhdong)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2140
---
 cpp/bench/prims/CMakeLists.txt                |   1 +
 cpp/bench/prims/sparse/select_k_csr.cu        | 287 ++++++++++++
 .../raft/matrix/detail/select_radix.cuh       | 427 ++++++++++--------
 .../raft/matrix/detail/select_warpsort.cuh    |  55 ++-
 .../sparse/matrix/detail/select_k-ext.cuh     |  67 +++
 .../sparse/matrix/detail/select_k-inl.cuh     | 225 +++++++++
 .../raft/sparse/matrix/detail/select_k.cuh    |  24 +
 cpp/include/raft/sparse/matrix/select_k.cuh   |  87 ++++
 .../matrix/detail/select_k_double_int64_t.cu  |  32 ++
 .../matrix/detail/select_k_double_uint32_t.cu |  34 ++
 .../matrix/detail/select_k_float_int32.cu     |  32 ++
 .../matrix/detail/select_k_float_int64_t.cu   |  32 ++
 .../matrix/detail/select_k_float_uint32_t.cu  |  32 ++
 .../matrix/detail/select_k_half_int64_t.cu    |  32 ++
 .../matrix/detail/select_k_half_uint32_t.cu   |  32 ++
 cpp/test/CMakeLists.txt                       |   1 +
 cpp/test/sparse/select_k_csr.cu               | 398 ++++++++++++++++
 17 files changed, 1600 insertions(+), 198 deletions(-)
 create mode 100644 cpp/bench/prims/sparse/select_k_csr.cu
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/select_k.cuh
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int32.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
 create mode 100644 cpp/test/sparse/select_k_csr.cu

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 9f23c44a5c..0c5521d447 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -137,6 +137,7 @@ if(BUILD_PRIMS_BENCH)
     PATH
     bench/prims/sparse/bitmap_to_csr.cu
     bench/prims/sparse/convert_csr.cu
+    bench/prims/sparse/select_k_csr.cu
     bench/prims/main.cpp
   )
 
diff --git a/cpp/bench/prims/sparse/select_k_csr.cu b/cpp/bench/prims/sparse/select_k_csr.cu
new file mode 100644
index 0000000000..a91e6c8514
--- /dev/null
+++ b/cpp/bench/prims/sparse/select_k_csr.cu
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/matrix/select_k.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <random>
+#include <sstream>
+#include <unordered_set>
+#include <vector>
+
+namespace raft::bench::sparse {
+
+template <typename index_t>
+struct bench_param {  // One configuration of the CSR select_k benchmark.
+  index_t n_rows;  // Rows of the generated matrix.
+  index_t n_cols;  // Columns of the generated matrix.
+  index_t top_k;   // Columns of the output views (n_rows x top_k).
+  float sparsity;  // Fraction of n_rows * n_cols cells stored as non-zeros (a density).
+  bool select_min         = true;   // Forwarded to select_k as `select_min`.
+  bool customized_indices = false;  // If true, pass an explicit in_idx to select_k.
+};
+
+template <typename index_t>
+inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
+{  // Streams the fields as "n_rows#n_cols#top_k#sparsity".
+  os << params.n_rows << "#" << params.n_cols << "#" << params.top_k << "#" << params.sparsity;
+  return os;
+}
+
+template <typename value_t, typename index_t>
+struct SelectKCsrTest : public fixture {  // Benchmark fixture: random CSR input for select_k.
+  SelectKCsrTest(const bench_param<index_t>& p)
+    : fixture(true),
+      params(p),
+      handle(stream),  // Declared before params, so it is initialized before params.
+      values_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      customized_indices_d(0, stream),
+      dst_values_d(0, stream),
+      dst_indices_d(0, stream)
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols);  // Row-major non-zero mask.
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> customized_indices_h(nnz);  // Stays all zeros; never filled below.
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    std::vector<value_t> dst_values_h(params.n_rows * params.top_k, static_cast<value_t>(2.0f));
+    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k,
+                                       static_cast<index_t>(params.n_rows * params.n_cols * 100));
+
+    dst_values_d.resize(params.n_rows * params.top_k, stream);  // dst_*_h are never uploaded.
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+    values_d.resize(nnz, stream);
+
+    if (nnz) {  // Generate the nnz values with make_blobs, then copy them into values_d.
+      auto blobs_values = raft::make_device_matrix<value_t, index_t>(handle, 1, nnz);
+      auto labels       = raft::make_device_vector<index_t, index_t>(handle, 1);
+
+      raft::random::make_blobs<value_t, index_t>(blobs_values.data_handle(),
+                                                 labels.data_handle(),
+                                                 1,
+                                                 nnz,
+                                                 1,
+                                                 stream,
+                                                 false,
+                                                 nullptr,
+                                                 nullptr,
+                                                 value_t(1.0),
+                                                 false,
+                                                 value_t(-10.0f),
+                                                 value_t(10.0f),
+                                                 uint64_t(2024));
+      raft::copy(values_d.data(), blobs_values.data_handle(), nnz, stream);
+      resource::sync_stream(handle);
+    }
+
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+
+    if (params.customized_indices) {  // Upload the (all-zero) custom indices only on request.
+      customized_indices_d.resize(nnz, stream);
+      update_device(customized_indices_d.data(),
+                    customized_indices_h.data(),
+                    customized_indices_h.size(),
+                    stream);
+    }
+  }
+
+  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
+  {  // Sets m * n * sparsity (truncated) random cells of `matrix` to true; returns that count.
+    index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    index_t res            = num_ones;  // Returned count; num_ones is counted down below.
+
+    for (index_t i = 0; i < total_elements; ++i) {
+      matrix[i] = false;
+    }
+
+    std::random_device rd;  // Seed source; the pattern is typically different on every run.
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis_idx(0, total_elements - 1);  // Draws int indices.
+
+    while (num_ones > 0) {  // Retry until an unset cell is hit, so exactly `res` cells are set.
+      size_t index = dis_idx(gen);
+      if (matrix[index] == false) {
+        matrix[index] = true;
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {  // Scans the row-major mask and fills CSR column indices and row offsets.
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;  // The first row starts at offset 0.
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);  // End offset of row i.
+    }
+  }
+
+  template <typename data_t>
+  std::optional<data_t> get_opt_var(data_t x)
+  {  // Returns x wrapped in an optional only when params.customized_indices is set.
+    if (params.customized_indices) {
+      return x;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {  // Wraps the device buffers in views and runs select_k once, then inside loop_on_state.
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    auto in_val_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(),
+      indices_d.data(),
+      params.n_rows,
+      params.n_cols,
+      static_cast<index_t>(indices_d.size()));
+
+    auto in_val =
+      raft::make_device_csr_matrix_view<const value_t>(values_d.data(), in_val_structure);
+
+    std::optional<raft::device_vector_view<const index_t, index_t>> in_idx;
+
+    in_idx = get_opt_var(  // nullopt unless params.customized_indices is set.
+      raft::make_device_vector_view<const index_t, index_t>(customized_indices_d.data(), nnz));
+
+    auto out_val = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::sparse::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+    resource::sync_stream(handle);  // Finish the first call before entering loop_on_state.
+    loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() {
+      raft::sparse::matrix::select_k(
+        handle, in_val, in_idx, out_val, out_idx, params.select_min, false);
+      resource::sync_stream(handle);  // Wait for completion inside each iteration.
+    });
+  }
+
+ protected:
+  const raft::device_resources handle;
+
+  bench_param<index_t> params;
+  index_t nnz;  // Number of stored non-zeros; assigned in the constructor.
+
+  rmm::device_uvector<value_t> values_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<index_t> customized_indices_d;
+
+  rmm::device_uvector<value_t> dst_values_d;
+  rmm::device_uvector<index_t> dst_indices_d;
+};  // struct SelectKCsrTest
+
+template <typename index_t>
+const std::vector<bench_param<index_t>> getInputs()
+{  // Builds the benchmark grid: every shape in params_group at sparsity 0.1, 0.2 and 0.5.
+  std::vector<bench_param<index_t>> param_vec;
+  struct TestParams {
+    index_t m;  // n_rows
+    index_t n;  // n_cols
+    index_t k;  // top_k
+  };
+
+  const std::vector<TestParams> params_group{  // {m, n, k}; some entries are repeated.
+    {20000, 500, 1},    {20000, 500, 2},    {20000, 500, 4},   {20000, 500, 8},
+    {20000, 500, 16},   {20000, 500, 32},   {20000, 500, 64},  {20000, 500, 128},
+    {20000, 500, 256},
+
+    {1000, 10000, 1},   {1000, 10000, 2},   {1000, 10000, 4},  {1000, 10000, 8},
+    {1000, 10000, 16},  {1000, 10000, 32},  {1000, 10000, 64}, {1000, 10000, 128},
+    {1000, 10000, 256},
+
+    {100, 100000, 1},   {100, 100000, 2},   {100, 100000, 4},  {100, 100000, 8},
+    {100, 100000, 16},  {100, 100000, 32},  {100, 100000, 64}, {100, 100000, 128},
+    {100, 100000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 2},   {10, 1000000, 4},  {10, 1000000, 8},
+    {10, 1000000, 16},  {10, 1000000, 32},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 2},   {10, 1000000, 4},  {10, 1000000, 8},
+    {10, 1000000, 16},  {10, 1000000, 32},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256}, {1000, 10000, 1},   {1000, 10000, 16}, {1000, 10000, 64},
+    {1000, 10000, 128}, {1000, 10000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256}, {1000, 10000, 1},   {1000, 10000, 16}, {1000, 10000, 64},
+    {1000, 10000, 128}, {1000, 10000, 256}};
+
+  param_vec.reserve(params_group.size() * 3);  // One entry per shape for each of the 3 passes.
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, 0.1}));
+  }
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, 0.2}));
+  }
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, 0.5}));
+  }
+  return param_vec;
+}
+
+RAFT_BENCH_REGISTER((SelectKCsrTest<float, uint32_t>), "", getInputs<uint32_t>());
+
+}  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index 36a346fda3..83d4845c31 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -442,14 +442,76 @@ _RAFT_DEVICE void last_filter(const T* in_buf,
   }
 }
 
-template <typename T, typename IdxT, int BitsPerPass>
+// Picks the buffers that pass number `pass` reads from and writes to. Passes 0 and 1 read the
+// caller's `in` (pass 0 has no index input and no outputs; pass 1 writes buf1/idx_buf1). Later
+// passes alternate: even ones read buf1 and write buf2, odd ones read buf2 and write buf1.
+template <typename T, typename IdxT>
+_RAFT_DEVICE void set_buf_pointers(const T* in,
+                                   const IdxT* in_idx,
+                                   char* bufs,
+                                   IdxT buf_len,
+                                   int pass,
+                                   const T*& in_buf,
+                                   const IdxT*& in_idx_buf,
+                                   T*& out_buf,
+                                   IdxT*& out_idx_buf)
+{
+  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2
+  T* const buf1        = reinterpret_cast<T*>(bufs);
+  T* const buf2        = buf1 + buf_len;
+  IdxT* const idx_buf1 = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+  IdxT* const idx_buf2 = idx_buf1 + buf_len;
+
+  if (pass == 0 || pass == 1) {
+    in_buf      = in;
+    in_idx_buf  = (pass == 0) ? nullptr : in_idx;
+    out_buf     = (pass == 0) ? nullptr : buf1;
+    out_idx_buf = (pass == 0) ? nullptr : idx_buf1;
+    return;
+  }
+  // Remaining passes: the parity of `pass` decides which half is the source.
+  const bool reads_buf1 = (pass % 2 == 0);
+  in_buf                = reads_buf1 ? buf1 : buf2;
+  in_idx_buf            = reads_buf1 ? idx_buf1 : idx_buf2;
+  out_buf               = reads_buf1 ? buf2 : buf1;
+  out_idx_buf           = reads_buf1 ? idx_buf2 : idx_buf1;
+}
+
+// Read-only variant: yields only the pair that the 9-argument overload reports as
+// `out_buf`/`out_idx_buf` for the same `pass`; `in` and `in_idx` are not read here.
+template <typename T, typename IdxT>
+_RAFT_DEVICE void set_buf_pointers(const T* in,
+                                   const IdxT* in_idx,
+                                   char* bufs,
+                                   IdxT buf_len,
+                                   const int pass,
+                                   const T*& out_buf,
+                                   const IdxT*& out_idx_buf)
+{
+  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2
+  if (pass == 0) {
+    // Pass 0: both pointers are null.
+    out_buf     = nullptr;
+    out_idx_buf = nullptr;
+    return;
+  }
+  T* const buf1        = reinterpret_cast<T*>(bufs);
+  IdxT* const idx_buf1 = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+  // Even passes map to the second half (buf2/idx_buf2); every other pass to the first.
+  const bool second_half = (pass % 2 == 0);
+  out_buf                = second_half ? (buf1 + buf_len) : buf1;
+  out_idx_buf            = second_half ? (idx_buf1 + buf_len) : idx_buf1;
+}
+
+template <typename T, typename IdxT, int BitsPerPass, bool len_or_indptr = true>
 RAFT_KERNEL last_filter_kernel(const T* in,
                                const IdxT* in_idx,
-                               const T* in_buf,
-                               const IdxT* in_idx_buf,
+                               char* bufs,
+                               size_t offset,
                                T* out,
                                IdxT* out_idx,
                                const IdxT len,
+                               const IdxT* len_i,
                                const IdxT k,
                                Counter<T, IdxT>* counters,
                                const bool select_min)
@@ -458,22 +520,31 @@ RAFT_KERNEL last_filter_kernel(const T* in,
 
   Counter<T, IdxT>* counter = counters + batch_id;
   IdxT previous_len         = counter->previous_len;
+
   if (previous_len == 0) { return; }
+
+  const IdxT l_len    = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]);
+  const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id];
+
   const IdxT buf_len = calc_buf_len<T>(len);
-  if (previous_len > buf_len || in_buf == in) {
-    in_buf       = in + batch_id * len;
-    in_idx_buf   = in_idx ? (in_idx + batch_id * len) : nullptr;
-    previous_len = len;
-  } else {
-    in_buf += batch_id * buf_len;
-    in_idx_buf += batch_id * buf_len;
-  }
-  out += batch_id * k;
-  out_idx += batch_id * k;
+
+  const T* in_buf        = nullptr;
+  const IdxT* in_idx_buf = nullptr;
+  bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
 
   constexpr int pass      = calc_num_passes<T, BitsPerPass>() - 1;
   constexpr int start_bit = calc_start_bit<T, BitsPerPass>(pass);
 
+  set_buf_pointers(in + l_offset, in_idx + l_offset, bufs, buf_len, pass, in_buf, in_idx_buf);
+
+  if (previous_len > buf_len || in_buf == in + l_offset) {
+    in_buf       = in + l_offset;
+    in_idx_buf   = in_idx ? (in_idx + l_offset) : nullptr;
+    previous_len = l_len;
+  }
+  out += batch_id * k;
+  out_idx += batch_id * k;
+
   const auto kth_value_bits    = counter->kth_value_bits;
   const IdxT num_of_kth_needed = counter->k;
   IdxT* p_out_cnt              = &counter->out_cnt;
@@ -510,6 +581,29 @@ RAFT_KERNEL last_filter_kernel(const T* in,
                      f);
 }
 
+// Fills `dest[0..k)` using the threads of one block: positions below `len` are copied from
+// `src`, the rest get `upper_bound<T>()` when `select_min` is set, else `lower_bound<T>()`.
+template <typename T, typename IdxT, typename S>
+_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_val(
+  T* dest, const T* src, S len, IdxT k, const bool select_min)
+{
+  const T padding = select_min ? upper_bound<T>() : lower_bound<T>();
+  for (S pos = S(threadIdx.x); pos < k; pos += S(blockDim.x)) {
+    dest[pos] = (pos < len) ? src[pos] : padding;
+  }
+}
+
+// Fills `dest[0..len)` using the threads of one block: a copy of `src` when it is non-null,
+// otherwise each position's own index (0, 1, ..., len - 1).
+template <typename T, typename S>
+_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_idx(T* dest, const T* src, S len)
+{
+  const bool has_src = (src != nullptr);
+  for (S pos = S(threadIdx.x); pos < len; pos += S(blockDim.x)) {
+    dest[pos] = has_src ? src[pos] : pos;
+  }
+}
+
 /**
  *
  * It is expected to call this kernel multiple times (passes), in each pass we process a radix,
@@ -545,13 +639,16 @@ RAFT_KERNEL last_filter_kernel(const T* in,
  * rather than from `in_buf`. The benefit is that we can save the cost of writing candidates and
  * their indices.
  */
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool fused_last_filter>
+template <typename T,
+          typename IdxT,
+          int BitsPerPass,
+          int BlockSize,
+          bool fused_last_filter,
+          bool len_or_indptr>
 RAFT_KERNEL radix_kernel(const T* in,
                          const IdxT* in_idx,
-                         const T* in_buf,
-                         const IdxT* in_idx_buf,
-                         T* out_buf,
-                         IdxT* out_idx_buf,
+                         char* bufs,
+                         size_t offset,
                          T* out,
                          IdxT* out_idx,
                          Counter<T, IdxT>* counters,
@@ -567,21 +664,38 @@ RAFT_KERNEL radix_kernel(const T* in,
   IdxT current_k;
   IdxT previous_len;
   IdxT current_len;
+
+  const IdxT l_len    = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]);
+  const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id];
+
   if (pass == 0) {
     current_k    = k;
-    previous_len = len;
+    previous_len = l_len;
     // Need to do this so setting counter->previous_len for the next pass is correct.
     // This value is meaningless for pass 0, but it's fine because pass 0 won't be the
     // last pass in this implementation so pass 0 won't hit the "if (pass ==
     // num_passes - 1)" branch.
     // Maybe it's better to reload counter->previous_len and use it rather than
     // current_len in last_filter()
-    current_len = len;
+    current_len = l_len;
   } else {
     current_k    = counter->k;
     current_len  = counter->len;
     previous_len = counter->previous_len;
   }
+  if constexpr (!len_or_indptr) {
+    if (pass == 0 && l_len <= k) {
+      copy_in_val(out + batch_id * k, in + l_offset, l_len, k, select_min);
+      copy_in_idx(out_idx + batch_id * k, (in_idx ? (in_idx + l_offset) : nullptr), l_len);
+      if (threadIdx.x == 0) {
+        counter->previous_len = 0;
+        counter->len          = 0;
+      }
+      __syncthreads();
+      return;
+    }
+  }
+
   if (current_len == 0) { return; }
 
   // When k=len, early_stop will be true at pass 0. It means filter_and_histogram() should handle
@@ -590,20 +704,33 @@ RAFT_KERNEL radix_kernel(const T* in,
   const bool early_stop = (current_len == current_k);
   const IdxT buf_len    = calc_buf_len<T>(len);
 
+  const T* in_buf;
+  const IdxT* in_idx_buf;
+  T* out_buf;
+  IdxT* out_idx_buf;
+  bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
+
+  set_buf_pointers(in + l_offset,
+                   (in_idx ? (in_idx + l_offset) : nullptr),
+                   bufs,
+                   buf_len,
+                   pass,
+                   in_buf,
+                   in_idx_buf,
+                   out_buf,
+                   out_idx_buf);
+
   // "previous_len > buf_len" means previous pass skips writing buffer
   if (pass == 0 || pass == 1 || previous_len > buf_len) {
-    in_buf       = in + batch_id * len;
-    in_idx_buf   = in_idx ? (in_idx + batch_id * len) : nullptr;
-    previous_len = len;
-  } else {
-    in_buf += batch_id * buf_len;
-    in_idx_buf += batch_id * buf_len;
+    in_buf       = in + l_offset;
+    in_idx_buf   = in_idx ? (in_idx + l_offset) : nullptr;
+    previous_len = l_len;
   }
 
   // in case we have individual len for each query defined we want to make sure
   // that we only iterate valid elements.
   if (len_i != nullptr) {
-    const IdxT max_len = max(len_i[batch_id], k);
+    const IdxT max_len = max(l_len, k);
     if (max_len < previous_len) previous_len = max_len;
   }
 
@@ -611,9 +738,6 @@ RAFT_KERNEL radix_kernel(const T* in,
   if (pass == 0 || current_len > buf_len) {
     out_buf     = nullptr;
     out_idx_buf = nullptr;
-  } else {
-    out_buf += batch_id * buf_len;
-    out_idx_buf += batch_id * buf_len;
   }
   out += batch_id * k;
   out_idx += batch_id * k;
@@ -640,7 +764,6 @@ RAFT_KERNEL radix_kernel(const T* in,
     unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1);
     isLastBlock           = (finished == (gridDim.x - 1));
   }
-
   if (__syncthreads_or(isLastBlock)) {
     if (early_stop) {
       if (threadIdx.x == 0) {
@@ -676,7 +799,7 @@ RAFT_KERNEL radix_kernel(const T* in,
                                           out_idx_buf ? out_idx_buf : in_idx_buf,
                                           out,
                                           out_idx,
-                                          out_buf ? current_len : len,
+                                          out_buf ? current_len : l_len,
                                           k,
                                           counter,
                                           select_min,
@@ -726,7 +849,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
 
   int active_blocks;
   RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &active_blocks, radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>, BlockSize, 0));
+    &active_blocks, radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, true>, BlockSize, 0));
   active_blocks *= sm_cnt;
 
   IdxT best_num_blocks         = 0;
@@ -757,78 +880,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
   return best_num_blocks;
 }
 
-template <typename T, typename IdxT>
-_RAFT_HOST void set_buf_pointers(const T* in,
-                                 const IdxT* in_idx,
-                                 T* buf1,
-                                 IdxT* idx_buf1,
-                                 T* buf2,
-                                 IdxT* idx_buf2,
-                                 int pass,
-                                 const T*& in_buf,
-                                 const IdxT*& in_idx_buf,
-                                 T*& out_buf,
-                                 IdxT*& out_idx_buf)
-{
-  if (pass == 0) {
-    in_buf      = in;
-    in_idx_buf  = nullptr;
-    out_buf     = nullptr;
-    out_idx_buf = nullptr;
-  } else if (pass == 1) {
-    in_buf      = in;
-    in_idx_buf  = in_idx;
-    out_buf     = buf1;
-    out_idx_buf = idx_buf1;
-  } else if (pass % 2 == 0) {
-    in_buf      = buf1;
-    in_idx_buf  = idx_buf1;
-    out_buf     = buf2;
-    out_idx_buf = idx_buf2;
-  } else {
-    in_buf      = buf2;
-    in_idx_buf  = idx_buf2;
-    out_buf     = buf1;
-    out_idx_buf = idx_buf1;
-  }
-}
-
-template <typename T, typename IdxT>
-_RAFT_DEVICE void set_buf_pointers(const T* in,
-                                   const IdxT* in_idx,
-                                   char* bufs,
-                                   IdxT buf_len,
-                                   int pass,
-                                   const T*& in_buf,
-                                   const IdxT*& in_idx_buf,
-                                   T*& out_buf,
-                                   IdxT*& out_idx_buf)
-{
-  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2
-  if (pass == 0) {
-    in_buf      = in;
-    in_idx_buf  = nullptr;
-    out_buf     = nullptr;
-    out_idx_buf = nullptr;
-  } else if (pass == 1) {
-    in_buf      = in;
-    in_idx_buf  = in_idx;
-    out_buf     = reinterpret_cast<T*>(bufs);
-    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-  } else if (pass % 2 == 0) {
-    in_buf      = reinterpret_cast<T*>(bufs);
-    in_idx_buf  = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-    out_buf     = const_cast<T*>(in_buf + buf_len);
-    out_idx_buf = const_cast<IdxT*>(in_idx_buf + buf_len);
-  } else {
-    out_buf     = reinterpret_cast<T*>(bufs);
-    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-    in_buf      = out_buf + buf_len;
-    in_idx_buf  = out_idx_buf + buf_len;
-  }
-}
-
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 void radix_topk(const T* in,
                 const IdxT* in_idx,
                 int batch_size,
@@ -850,7 +902,7 @@ void radix_topk(const T* in,
 
   if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
 
-  auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
+  auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, len_or_indptr>;
   const size_t max_chunk_size =
     calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, false);
   if (max_chunk_size != static_cast<size_t>(batch_size)) {
@@ -862,55 +914,33 @@ void radix_topk(const T* in,
 
   rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);
   rmm::device_uvector<IdxT> histograms(max_chunk_size * num_buckets, stream, mr);
-  rmm::device_uvector<T> buf1(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf1(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<T> buf2(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf2(max_chunk_size * buf_len, stream, mr);
+
+  rmm::device_uvector<char> bufs(
+    max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr);
 
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size = std::min(max_chunk_size, batch_size - offset);
     RAFT_CUDA_TRY(
       cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter<T, IdxT>), stream));
     RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream));
-    auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
+    auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, len_or_indptr>;
 
-    const T* chunk_in        = in + offset * len;
-    const IdxT* chunk_in_idx = in_idx ? (in_idx + offset * len) : nullptr;
-    T* chunk_out             = out + offset * k;
-    IdxT* chunk_out_idx      = out_idx + offset * k;
-    const IdxT* chunk_len_i  = len_i ? (len_i + offset) : nullptr;
-
-    const T* in_buf        = nullptr;
-    const IdxT* in_idx_buf = nullptr;
-    T* out_buf             = nullptr;
-    IdxT* out_idx_buf      = nullptr;
+    T* chunk_out            = out + offset * k;
+    IdxT* chunk_out_idx     = out_idx + offset * k;
+    const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr;
 
     dim3 blocks(grid_dim, chunk_size);
     constexpr int num_passes = calc_num_passes<T, BitsPerPass>();
 
     for (int pass = 0; pass < num_passes; ++pass) {
-      set_buf_pointers(chunk_in,
-                       chunk_in_idx,
-                       buf1.data(),
-                       idx_buf1.data(),
-                       buf2.data(),
-                       idx_buf2.data(),
-                       pass,
-                       in_buf,
-                       in_idx_buf,
-                       out_buf,
-                       out_idx_buf);
-
       if (fused_last_filter && pass == num_passes - 1) {
-        kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, true>;
+        kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, true, len_or_indptr>;
       }
 
-      kernel<<<blocks, BlockSize, 0, stream>>>(chunk_in,
-                                               chunk_in_idx,
-                                               in_buf,
-                                               in_idx_buf,
-                                               out_buf,
-                                               out_idx_buf,
+      kernel<<<blocks, BlockSize, 0, stream>>>(in,
+                                               in_idx,
+                                               bufs.data(),
+                                               offset,
                                                chunk_out,
                                                chunk_out_idx,
                                                counters.data(),
@@ -924,16 +954,18 @@ void radix_topk(const T* in,
     }
 
     if (!fused_last_filter) {
-      last_filter_kernel<T, IdxT, BitsPerPass><<<blocks, BlockSize, 0, stream>>>(chunk_in,
-                                                                                 chunk_in_idx,
-                                                                                 out_buf,
-                                                                                 out_idx_buf,
-                                                                                 chunk_out,
-                                                                                 chunk_out_idx,
-                                                                                 len,
-                                                                                 k,
-                                                                                 counters.data(),
-                                                                                 select_min);
+      last_filter_kernel<T, IdxT, BitsPerPass, len_or_indptr>
+        <<<blocks, BlockSize, 0, stream>>>(in,
+                                           in_idx,
+                                           bufs.data(),
+                                           offset,
+                                           chunk_out,
+                                           chunk_out_idx,
+                                           len,
+                                           chunk_len_i,
+                                           k,
+                                           counters.data(),
+                                           select_min);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
     }
   }
@@ -1015,7 +1047,7 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf,
   }
 }
 
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         const IdxT* in_idx,
                                         const IdxT len,
@@ -1024,30 +1056,48 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         T* out,
                                         IdxT* out_idx,
                                         const bool select_min,
-                                        char* bufs)
+                                        char* bufs,
+                                        size_t offset)
 {
   constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
   __shared__ Counter<T, IdxT> counter;
   __shared__ IdxT histogram[num_buckets];
 
+  const size_t batch_id = blockIdx.x;  // size_t to avoid multiplication overflow
+
+  IdxT l_len    = len;
+  IdxT l_offset = (offset + batch_id) * len;
+  if constexpr (!len_or_indptr) {
+    l_offset = len_i[batch_id];
+    l_len    = len_i[batch_id + 1] - l_offset;
+  }
+
   if (threadIdx.x == 0) {
     counter.k              = k;
-    counter.len            = len;
-    counter.previous_len   = len;
+    counter.len            = l_len;
+    counter.previous_len   = l_len;
     counter.kth_value_bits = 0;
     counter.out_cnt        = 0;
     counter.out_back_cnt   = 0;
   }
   __syncthreads();
 
-  const size_t batch_id = blockIdx.x;  // size_t to avoid multiplication overflow
-  in += batch_id * len;
-  if (in_idx) { in_idx += batch_id * len; }
+  in += l_offset;
+  if (in_idx) { in_idx += l_offset; }
   out += batch_id * k;
   out_idx += batch_id * k;
   const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
   bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
 
+  if constexpr (!len_or_indptr) {
+    if (l_len <= k) {
+      copy_in_val(out, in, l_len, k, select_min);
+      copy_in_idx(out_idx, in_idx, l_len);
+      __syncthreads();
+      return;
+    }
+  }
+
   constexpr int num_passes = calc_num_passes<T, BitsPerPass>();
   for (int pass = 0; pass < num_passes; ++pass) {
     const T* in_buf;
@@ -1073,7 +1123,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
     // in case we have individual len for each query defined we want to make sure
     // that we only iterate valid elements.
     if (len_i != nullptr) {
-      const IdxT max_len = max(len_i[batch_id], k);
+      const IdxT max_len = max(l_len, k);
       if (max_len < previous_len) previous_len = max_len;
     }
 
@@ -1102,7 +1152,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         out_buf ? out_idx_buf : in_idx,
                                         out,
                                         out_idx,
-                                        out_buf ? current_len : len,
+                                        out_buf ? current_len : l_len,
                                         k,
                                         &counter,
                                         select_min,
@@ -1117,7 +1167,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
 // counters and global histograms, can be kept in shared memory and cheap sync operations can be
 // used. It's used when len is relatively small or when the number of blocks per row calculated by
 // `calc_grid_dim()` is 1.
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 void radix_topk_one_block(const T* in,
                           const IdxT* in_idx,
                           int batch_size,
@@ -1133,7 +1183,7 @@ void radix_topk_one_block(const T* in,
 {
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);
 
-  auto kernel        = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize>;
+  auto kernel        = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>;
   const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
   const size_t max_chunk_size =
     calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, true);
@@ -1144,15 +1194,16 @@ void radix_topk_one_block(const T* in,
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size          = std::min(max_chunk_size, batch_size - offset);
     const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr;
-    kernel<<<chunk_size, BlockSize, 0, stream>>>(in + offset * len,
-                                                 in_idx ? (in_idx + offset * len) : nullptr,
+    kernel<<<chunk_size, BlockSize, 0, stream>>>(in,
+                                                 in_idx,
                                                  len,
                                                  chunk_len_i,
                                                  k,
                                                  out + offset * k,
                                                  out_idx + offset * k,
                                                  select_min,
-                                                 bufs.data());
+                                                 bufs.data(),
+                                                 offset);
   }
 }
 
@@ -1182,6 +1233,10 @@ void radix_topk_one_block(const T* in,
  *   it affects the number of passes and number of buckets.
  * @tparam BlockSize
  *   Number of threads in a kernel thread block.
+ * @tparam len_or_indptr
+ *   Flag to interpret `len_i` as either direct row lengths (true) or CSR format
+ *   index pointers (false). When true, each `len_i` element denotes the length of a row. When
+ *   false, `len_i` represents the index pointers for a CSR matrix with shape of `batch_size + 1`.
  *
  * @param[in] res container of reusable resources
  * @param[in] in
@@ -1212,9 +1267,12 @@ void radix_topk_one_block(const T* in,
  *   same. That is, when the value range of input data is narrow. In such case, there could be a
  *   large number of inputs for the last filter, hence using multiple thread blocks is beneficial.
  * @param len_i
- *   optional array of size (batch_size) providing lengths for each individual row
+ *   Optional array used differently based on `len_or_indptr`:
+ *   When `len_or_indptr` is true, `len_i` presents the lengths of each row, which is `batch_size`.
+ *   When `len_or_indptr` is false, `len_i` works like a indptr for a CSR matrix. The length of each
+ *   row would be (`len_i[row_id + 1] - len_i[row_id]`). `len_i` size is `batch_size + 1`.
  */
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr = true>
 void select_k(raft::resources const& res,
               const T* in,
               const IdxT* in_idx,
@@ -1227,9 +1285,12 @@ void select_k(raft::resources const& res,
               bool fused_last_filter,
               const IdxT* len_i)
 {
+  RAFT_EXPECTS(!(!len_or_indptr && (len_i == nullptr)),
+               "When `len_or_indptr` is false, `len_i` must not be nullptr!");
+
   auto stream = resource::get_cuda_stream(res);
   auto mr     = resource::get_workspace_resource(res);
-  if (k == len) {
+  if (k == len && len_or_indptr) {
     RAFT_CUDA_TRY(
       cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream));
     if (in_idx) {
@@ -1248,29 +1309,29 @@ void select_k(raft::resources const& res,
   constexpr int items_per_thread = 32;
 
   if (len <= BlockSize * items_per_thread) {
-    impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize>(
+    impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(
       in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
   } else {
     unsigned grid_dim =
       impl::calc_grid_dim<T, IdxT, BitsPerPass, BlockSize>(batch_size, len, sm_cnt);
     if (grid_dim == 1) {
-      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize>(
+      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(
         in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
     } else {
-      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize>(in,
-                                                        in_idx,
-                                                        batch_size,
-                                                        len,
-                                                        k,
-                                                        out,
-                                                        out_idx,
-                                                        select_min,
-                                                        fused_last_filter,
-                                                        len_i,
-                                                        grid_dim,
-                                                        sm_cnt,
-                                                        stream,
-                                                        mr);
+      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(in,
+                                                                       in_idx,
+                                                                       batch_size,
+                                                                       len,
+                                                                       k,
+                                                                       out,
+                                                                       out_idx,
+                                                                       select_min,
+                                                                       fused_last_filter,
+                                                                       len_i,
+                                                                       grid_dim,
+                                                                       sm_cnt,
+                                                                       stream,
+                                                                       mr);
     }
   }
 }
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index 572558153d..2cb32585d5 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -754,22 +754,32 @@ template <template <int, bool, typename, typename> class WarpSortClass,
           bool Ascending,
           typename T,
           typename IdxT>
-__launch_bounds__(256) RAFT_KERNEL
-  block_kernel(const T* in, const IdxT* in_idx, IdxT len, int k, T* out, IdxT* out_idx)
+__launch_bounds__(256) RAFT_KERNEL block_kernel(const T* in,
+                                                const IdxT* in_idx,
+                                                const IdxT* in_indptr,
+                                                size_t offset,
+                                                IdxT len,
+                                                int k,
+                                                T* out,
+                                                IdxT* out_idx)
 {
   extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
   using bq_t         = block_sort<WarpSortClass, Capacity, Ascending, T, IdxT>;
   uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr;
   bq_t queue(k, warp_smem);
+  const size_t batch_id = blockIdx.y;
 
-  in += blockIdx.y * len;
-  if (in_idx != nullptr) { in_idx += blockIdx.y * len; }
+  const IdxT l_len    = in_indptr ? (in_indptr[batch_id + 1] - in_indptr[batch_id]) : len;
+  const IdxT l_offset = in_indptr ? in_indptr[batch_id] : (offset + batch_id) * len;
+
+  in += l_offset;
+  if (in_idx != nullptr) { in_idx += l_offset; }
 
   const IdxT stride         = gridDim.x * blockDim.x;
-  const IdxT per_thread_lim = len + laneId();
+  const IdxT per_thread_lim = l_len + laneId();
   for (IdxT i = threadIdx.x + blockIdx.x * blockDim.x; i < per_thread_lim; i += stride) {
-    queue.add(i < len ? __ldcs(in + i) : WarpSortClass<Capacity, Ascending, T, IdxT>::kDummy,
-              (i < len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
+    queue.add(i < l_len ? __ldcs(in + i) : WarpSortClass<Capacity, Ascending, T, IdxT>::kDummy,
+              (i < l_len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
   }
 
   queue.done(smem_buf_bytes);
@@ -832,6 +842,7 @@ struct launch_setup {
                      int smem_size,
                      const T* in_key,
                      const IdxT* in_idx,
+                     const IdxT* in_indptr,
                      T* out_key,
                      IdxT* out_idx,
                      rmm::cuda_stream_view stream)
@@ -848,6 +859,7 @@ struct launch_setup {
                                                                           smem_size,
                                                                           in_key,
                                                                           in_idx,
+                                                                          in_indptr,
                                                                           out_key,
                                                                           out_idx,
                                                                           stream);
@@ -858,21 +870,23 @@ struct launch_setup {
     // This is less than cuda's max block dim along Y axis (65535), but it's a
     // power-of-two, which ensures the alignment of batches in memory.
     constexpr size_t kMaxGridDimY = 32768;
+    size_t g_offset               = 0;
     for (size_t offset = 0; offset < batch_size; offset += kMaxGridDimY) {
       size_t batch_chunk = std::min<size_t>(kMaxGridDimY, batch_size - offset);
       dim3 gs(num_blocks, batch_chunk, 1);
       if (select_min) {
-        block_kernel<WarpSortClass, Capacity, true, T, IdxT>
-          <<<gs, block_dim, smem_size, stream>>>(in_key, in_idx, IdxT(len), k, out_key, out_idx);
+        block_kernel<WarpSortClass, Capacity, true, T, IdxT><<<gs, block_dim, smem_size, stream>>>(
+          in_key, in_idx, in_indptr, g_offset, IdxT(len), k, out_key, out_idx);
       } else {
-        block_kernel<WarpSortClass, Capacity, false, T, IdxT>
-          <<<gs, block_dim, smem_size, stream>>>(in_key, in_idx, IdxT(len), k, out_key, out_idx);
+        block_kernel<WarpSortClass, Capacity, false, T, IdxT><<<gs, block_dim, smem_size, stream>>>(
+          in_key, in_idx, in_indptr, g_offset, IdxT(len), k, out_key, out_idx);
       }
       RAFT_CUDA_TRY(cudaPeekAtLastError());
       out_key += batch_chunk * num_blocks * k;
       out_idx += batch_chunk * num_blocks * k;
-      in_key += batch_chunk * len;
-      if (in_idx != nullptr) { in_idx += batch_chunk * len; }
+      // in_key/in_idx stay fixed; the kernel locates each chunk via g_offset or in_indptr.
+      if (in_indptr != nullptr) { in_indptr += batch_chunk; }
+      g_offset += batch_chunk;
     }
   }
 };
   }
 };
@@ -1010,6 +1024,7 @@ void select_k_(int num_of_block,
                int num_of_warp,
                const T* in,
                const IdxT* in_idx,
+               const IdxT* in_indptr,
                size_t batch_size,
                size_t len,
                int k,
@@ -1041,6 +1056,7 @@ void select_k_(int num_of_block,
                                                smem_size,
                                                in,
                                                in_idx,
+                                               in_indptr,
                                                result_val,
                                                result_idx,
                                                stream);
@@ -1056,6 +1072,7 @@ void select_k_(int num_of_block,
                                                  smem_size,
                                                  tmp_val.data(),
                                                  tmp_idx.data(),
+                                                 nullptr,
                                                  out,
                                                  out_idx,
                                                  stream);
@@ -1071,7 +1088,8 @@ void select_k_impl(raft::resources const& res,
                    int k,
                    T* out,
                    IdxT* out_idx,
-                   bool select_min)
+                   bool select_min,
+                   const IdxT* in_indptr = nullptr)
 {
   int num_of_block = 0;
   int num_of_warp  = 0;
@@ -1082,6 +1100,7 @@ void select_k_impl(raft::resources const& res,
                                     num_of_warp,
                                     in,
                                     in_idx,
+                                    in_indptr,
                                     batch_size,
                                     len,
                                     k,
@@ -1126,6 +1145,9 @@ void select_k_impl(raft::resources const& res,
  *   the payload selected together with `out`.
  * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
+ * @param[in] in_indptr
+ *   Optional CSR row offsets of the input: row i occupies [in_indptr[i], in_indptr[i + 1]).
+ *   If `nullptr` (the default), every row is assumed to have length @p len.
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& res,
@@ -1136,7 +1158,8 @@ void select_k(raft::resources const& res,
               int k,
               T* out,
               IdxT* out_idx,
-              bool select_min)
+              bool select_min,
+              const IdxT* in_indptr = nullptr)
 {
   ASSERT(k <= kMaxCapacity, "Current max k is %d (requested %d)", kMaxCapacity, k);
   ASSERT(len <= size_t(std::numeric_limits<IdxT>::max()),
@@ -1155,6 +1178,7 @@ void select_k(raft::resources const& res,
                                             num_of_warp,
                                             in,
                                             in_idx,
+                                            in_indptr,
                                             batch_size,
                                             len,
                                             k,
@@ -1170,6 +1194,7 @@ void select_k(raft::resources const& res,
                                            num_of_warp,
                                            in,
                                            in_idx,
+                                           in_indptr,
                                            batch_size,
                                            len,
                                            k,
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
new file mode 100644
index 0000000000..08bdfa6f30
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/matrix/select_k_types.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
+#include <rmm/cuda_stream_view.hpp>                  // rmm:cuda_stream_view
+#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
+
+#include <cuda_fp16.h>  // __half
+
+#include <cstdint>  // uint32_t
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+namespace raft::sparse::matrix::detail {
+// Declaration only: the definition lives in select_k-inl.cuh and is explicitly instantiated.
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted                   = false,
+              raft::matrix::SelectAlgo algo = raft::matrix::SelectAlgo::kAuto) RAFT_EXPLICIT;
+}  // namespace raft::sparse::matrix::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  extern template void raft::sparse::matrix::detail::select_k(        \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(__half, uint32_t);
+instantiate_raft_sparse_matrix_detail_select_k(__half, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, uint32_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, int);
+instantiate_raft_sparse_matrix_detail_select_k(double, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
new file mode 100644
index 0000000000..5f39affce6
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/matrix/detail/select_k-inl.cuh>
+#include <raft/matrix/select_k_types.hpp>
+
+#include <cub/cub.cuh>
+
+#include <type_traits>
+
+namespace raft::sparse::matrix::detail {
+
+using namespace raft::matrix::detail;
+using raft::matrix::SelectAlgo;
+
+/**
+ * Selects the k smallest or largest keys/values from each row of the input CSR matrix.
+ *
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
+ * selecting the k smallest or largest elements from each row. The selected elements are then stored
+ * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys), i.e. the element type of
+ *   `in_val`.
+ * @tparam IdxT
+ *   Type of the indices associated with the keys. `in_val` uses the same type
+ *   for all three of its index-related template arguments.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the column indices stored in `in_val` are used instead.
+ * @param[out] out_val
+ *   Output matrix [in_val.get_n_row(), k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`.
+ * @param[out] out_idx
+ *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   whether to make sure selected pairs are sorted by value
+ * @param[in] algo
+ *   the selection algorithm to use
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
+{
+  auto csr_view = in_val.structure_view();
+  auto nnz      = csr_view.get_nnz();
+
+  if (nnz == 0) return;
+
+  auto batch_size = csr_view.get_n_rows();
+  auto len        = csr_view.get_n_cols();
+  auto k          = IdxT(out_val.extent(1));
+
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "sparse::matrix::select_k(batch_size = %zu, len = %zu, k = %d)",
+    size_t(batch_size),
+    size_t(len),
+    int(k));
+
+  RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
+               "output k must fit the int type.");
+  RAFT_EXPECTS(batch_size == out_val.extent(0), "batch sizes must be equal");
+  RAFT_EXPECTS(batch_size == out_idx.extent(0), "batch sizes must be equal");
+
+  if (in_idx.has_value()) {
+    RAFT_EXPECTS(size_t(nnz) == in_idx->size(),
+                 "nnz of in_val must be equal to the length of in_idx");
+  }
+  RAFT_EXPECTS(IdxT(k) == out_idx.extent(1), "value and index output lengths must be equal");
+
+  if (algo == SelectAlgo::kAuto) { algo = choose_select_k_algorithm(batch_size, len, k); }
+
+  auto indptr = csr_view.get_indptr().data();
+
+  switch (algo) {
+    case SelectAlgo::kRadix8bits:
+    case SelectAlgo::kRadix11bits:
+    case SelectAlgo::kRadix11bitsExtraPass: {
+      if (algo == SelectAlgo::kRadix8bits) {
+        select::radix::select_k<T, IdxT, 8, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          true,
+          indptr);
+      } else {
+        bool fused_last_filter = algo == SelectAlgo::kRadix11bits;
+        select::radix::select_k<T, IdxT, 11, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          fused_last_filter,
+          indptr);
+      }
+
+      if (sorted) {
+        auto offsets = make_device_mdarray<IdxT, IdxT>(
+          handle, resource::get_workspace_resource(handle), make_extents<IdxT>(batch_size + 1));
+        raft::linalg::map_offset(handle, offsets.view(), mul_const_op<IdxT>(k));
+
+        auto keys =
+          raft::make_device_vector_view<T, IdxT>(out_val.data_handle(), (IdxT)(batch_size * k));
+        auto vals =
+          raft::make_device_vector_view<IdxT, IdxT>(out_idx.data_handle(), (IdxT)(batch_size * k));
+
+        segmented_sort_by_key<T, IdxT>(
+          handle, raft::make_const_mdspan(offsets.view()), keys, vals, select_min);
+      }
+
+      return;
+    }
+    case SelectAlgo::kWarpDistributed:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_distributed>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpDistributedShm:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_distributed_ext>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpAuto:
+      return select::warpsort::select_k<T, IdxT>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpImmediate:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_immediate>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpFiltered:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_filtered>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    default: RAFT_FAIL("K-selection Algorithm not supported.");
+  }
+}
+
+}  // namespace raft::sparse::matrix::detail
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k.cuh b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
new file mode 100644
index 0000000000..711169984b
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
+#include "select_k-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "select_k-ext.cuh"
+#endif
diff --git a/cpp/include/raft/sparse/matrix/select_k.cuh b/cpp/include/raft/sparse/matrix/select_k.cuh
new file mode 100644
index 0000000000..3f97e60c99
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/select_k.cuh
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/select_k_types.hpp>
+#include <raft/sparse/matrix/detail/select_k.cuh>
+
+#include <optional>
+
+namespace raft::sparse::matrix {
+
+using SelectAlgo = raft::matrix::SelectAlgo;
+
+/**
+ * @defgroup select_k Batched-select k smallest or largest key/values
+ * @{
+ */
+
+/**
+ * Selects the k smallest or largest keys/values from each row of the input matrix.
+ *
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
+ * selecting the k smallest or largest elements from each row. The selected elements are then stored
+ * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
+ * If a row holds fewer than k values, the remaining positions in the corresponding rows of
+ * `out_val` and `out_idx` keep whatever values they held before the call.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys).
+ * @tparam IdxT
+ *   Type of the indices associated with the keys.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the column indices stored in `in_val` are used instead.
+ * @param[out] out_val
+ *   Output matrix [in_val.get_n_row(), k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`.
+ * @param[out] out_idx
+ *   Output indices [in_val.get_n_row(), k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   whether to make sure selected pairs are sorted by value
+ * @param[in] algo
+ *   the selection algorithm to use
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
+{
+  return detail::select_k<T, IdxT>(
+    handle, in_val, in_idx, out_val, out_idx, select_min, sorted, algo);
+}
+/** @} */  // end of group select_k
+
+}  // namespace raft::sparse::matrix
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
new file mode 100644
index 0000000000..c784b50dad
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(double, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
new file mode 100644
index 0000000000..98bab9a504
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#include <cstdint>  // uint32_t
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int32.cu b/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
new file mode 100644
index 0000000000..bff213ae69
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(float, int);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
new file mode 100644
index 0000000000..412b06e587
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(float, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
new file mode 100644
index 0000000000..8ba3f0e22b
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(float, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
new file mode 100644
index 0000000000..24c844f8c8
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+// Explicit instantiation of raft::sparse::matrix::detail::select_k for
+// T = __half and IdxT = int64_t, written out without a helper macro.
+// IdxT is used for every index-like template argument: the three index
+// types of the CSR view, the optional in_idx vector and the output extents.
+// NOTE(review): assumes the -inl header included above defines this template.
+template void raft::sparse::matrix::detail::select_k(
+  raft::resources const& handle,
+  raft::device_csr_matrix_view<const __half, int64_t, int64_t, int64_t> in_val,
+  std::optional<raft::device_vector_view<const int64_t, int64_t>> in_idx,
+  raft::device_matrix_view<__half, int64_t, raft::row_major> out_val,
+  raft::device_matrix_view<int64_t, int64_t, raft::row_major> out_idx,
+  bool select_min,
+  bool sorted,
+  raft::matrix::SelectAlgo algo);
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
new file mode 100644
index 0000000000..d63dc64933
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+// Explicit instantiation of raft::sparse::matrix::detail::select_k for
+// T = __half and IdxT = uint32_t, written out without a helper macro.
+// IdxT is used for every index-like template argument: the three index
+// types of the CSR view, the optional in_idx vector and the output extents.
+// NOTE(review): assumes the -inl header included above defines this template.
+template void raft::sparse::matrix::detail::select_k(
+  raft::resources const& handle,
+  raft::device_csr_matrix_view<const __half, uint32_t, uint32_t, uint32_t> in_val,
+  std::optional<raft::device_vector_view<const uint32_t, uint32_t>> in_idx,
+  raft::device_matrix_view<__half, uint32_t, raft::row_major> out_val,
+  raft::device_matrix_view<uint32_t, uint32_t, raft::row_major> out_idx,
+  bool select_min,
+  bool sorted,
+  raft::matrix::SelectAlgo algo);
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 20ed3bacc7..4d17aacffd 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -320,6 +320,7 @@ if(BUILD_TESTS)
     test/sparse/reduce.cu
     test/sparse/row_op.cu
     test/sparse/sddmm.cu
+    test/sparse/select_k_csr.cu
     test/sparse/sort.cu
     test/sparse/spgemmi.cu
     test/sparse/spmm.cu
diff --git a/cpp/test/sparse/select_k_csr.cu b/cpp/test/sparse/select_k_csr.cu
new file mode 100644
index 0000000000..fc1061d7bb
--- /dev/null
+++ b/cpp/test/sparse/select_k_csr.cu
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/sparse/matrix/select_k.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <optional>
+#include <queue>
+#include <random>
+#include <unordered_set>
+#include <vector>
+
+namespace raft {
+namespace sparse {
+
+template <typename index_t>
+struct SelectKCsrInputs {  // one parameterized case of the CSR select_k test
+  index_t n_rows;           // rows of the sparse input matrix
+  index_t n_cols;           // columns of the sparse input matrix
+  index_t top_k;            // values selected per row (column count of the outputs)
+  float sparsity;           // fraction of cells holding a value (0.0 gives an empty matrix)
+  bool select_min;          // true: select the smallest values, false: the largest
+  bool customized_indices;  // true: pass an explicit in_idx vector to select_k
+};
+
+template <typename T>
+struct CompareApproxWithInf {  // approximate equality that also accepts two infinities
+  CompareApproxWithInf(T eps_) : eps(eps_) {}
+  // Equal if both infinite (any sign) or |a - b| <= eps, absolute or relative to max(|a|, |b|).
+  bool operator()(const T& a, const T& b) const
+  {
+    if (std::isinf(a) && std::isinf(b)) { return true; }
+    const T abs_diff = std::abs(a - b);
+    if (!(abs_diff > eps)) { return abs_diff <= eps; }
+    const T magnitude = std::max(std::abs(a), std::abs(b));
+    return (abs_diff / magnitude) <= eps;
+  }
+
+ private:
+  T eps;
+};
+
+// Parameterized fixture: builds a random CSR matrix on the host, computes the expected
+// top-k of every row on the CPU and checks raft::sparse::matrix::select_k against it.
+template <typename value_t, typename index_t>
+class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>> {
+ public:
+  // Initializers follow the member declaration order; device buffers start empty.
+  SelectKCsrTest()
+    : stream(resource::get_cuda_stream(handle)),
+      params(::testing::TestWithParam<SelectKCsrInputs<index_t>>::GetParam()),
+      nnz(0),
+      values_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      customized_indices_d(0, stream),
+      dst_values_d(0, stream),
+      dst_values_expected_d(0, stream),
+      dst_indices_d(0, stream),
+      dst_indices_expected_d(0, stream)
+  {
+  }
+
+ protected:
+  // Sets trunc(m * n * sparsity) randomly chosen cells of the row-major m x n mask
+  // `matrix` (needs at least m * n entries) and returns that count. `sparsity` is therefore
+  // the fraction of populated cells; it must not exceed 1 or the sampling loop never ends.
+  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
+  {
+    index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    index_t res            = num_ones;
+
+    std::fill(matrix.begin(), matrix.begin() + total_elements, false);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    // Draw in index_t rather than the default `int` so large bounds are not narrowed.
+    std::uniform_int_distribution<index_t> dis_idx(0, total_elements - 1);
+
+    // Rejection sampling: redraw until `num_ones` distinct cells have been set.
+    while (num_ones > 0) {
+      const auto index = static_cast<size_t>(dis_idx(gen));
+      if (!matrix[index]) {
+        matrix[index] = true;
+        --num_ones;
+      }
+    }
+    return res;
+  }
+
+  // Converts the dense row-major mask into CSR structure: `indices` receives the column
+  // of every set cell in row order and indptr[r]..indptr[r + 1] delimits row r. Both
+  // outputs must be pre-sized by the caller (one slot per set cell, rows + 1 offsets).
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {
+    index_t nnz_so_far = 0;
+    indptr[0]          = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) { indices[nnz_so_far++] = j; }
+      }
+      indptr[i + 1] = nnz_so_far;
+    }
+  }
+
+  // Host reference for select_k. For each CSR row a bounded heap keeps the `top_k` smallest
+  // (select_min) or largest values; they are then sorted best-first and written to row `row`
+  // of the row-major n_rows x top_k outputs. The stored index is the entry's column, or
+  // (*in_idx_h)[pos] when custom indices are given. Rows with fewer than `top_k` entries
+  // leave their remaining output slots untouched. `n_cols` is unused.
+  void cpu_select_k(const std::vector<index_t>& indptr_h,
+                    const std::vector<index_t>& indices_h,
+                    const std::vector<value_t>& values_h,
+                    std::optional<std::vector<index_t>>& in_idx_h,
+                    index_t n_rows,
+                    index_t n_cols,
+                    index_t top_k,
+                    std::vector<value_t>& out_values_h,
+                    std::vector<index_t>& out_indices_h,
+                    bool select_min = true)
+  {
+    using pair_t = std::pair<value_t, index_t>;
+    // Must be strict (`<` / `>`, never `<=` / `>=`): std::priority_queue and std::sort need
+    // a strict weak ordering. comp(a, b) is true when `a` is the better candidate.
+    auto comp = [select_min](const pair_t& a, const pair_t& b) {
+      return select_min ? a.first < b.first : a.first > b.first;
+    };
+
+    for (index_t row = 0; row < n_rows; ++row) {
+      // The heap top is the worst candidate kept so far, so pop() evicts exactly that one.
+      std::priority_queue<pair_t, std::vector<pair_t>, decltype(comp)> pq(comp);
+
+      for (index_t idx = indptr_h[row]; idx < indptr_h[row + 1]; ++idx) {
+        pq.push({values_h[idx], in_idx_h.has_value() ? (*in_idx_h)[idx] : indices_h[idx]});
+        if (pq.size() > static_cast<size_t>(top_k)) { pq.pop(); }
+      }
+
+      std::vector<pair_t> row_pairs;
+      row_pairs.reserve(pq.size());
+      while (!pq.empty()) {
+        row_pairs.push_back(pq.top());
+        pq.pop();
+      }
+      std::sort(row_pairs.begin(), row_pairs.end(), comp);
+
+      // At most `top_k` pairs survive the heap, so this never writes past the output row.
+      const auto n_selected = static_cast<index_t>(row_pairs.size());
+      for (index_t col = 0; col < n_selected; ++col) {
+        out_values_h[row * top_k + col]  = row_pairs[col].first;
+        out_indices_h[row * top_k + col] = row_pairs[col].second;
+      }
+    }
+  }
+
+  // Fills `array` with `size` pairwise-distinct values drawn uniformly from [-10, 10).
+  // Presumably distinct so that the expected top-k has no ties between candidates.
+  void random_array(value_t* array, size_t size)
+  {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<value_t> dis(-10.0, 10.0);
+    std::unordered_set<value_t> uset;
+
+    while (uset.size() < size) {
+      uset.insert(dis(gen));
+    }
+    std::copy(uset.begin(), uset.end(), array);
+  }
+
+  // Wraps `x` in an optional when the test case uses customized indices, else nullopt.
+  template <typename data_t>
+  std::optional<data_t> get_opt_var(data_t x)
+  {
+    if (params.customized_indices) { return x; }
+    return std::nullopt;
+  }
+
+  // Generates the random CSR input, computes the CPU reference result and uploads the
+  // inputs, the pre-filled output buffers and the expected outputs to the device.
+  void SetUp() override
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols, false);
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<value_t> values_h(nnz);
+    std::vector<index_t> indices_h(nnz);
+    // NOTE(review): never filled, so every customized index is 0 - confirm this is intended.
+    std::vector<index_t> customized_indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    // Outputs start as (+inf, 0); slots the CPU reference does not write keep these values.
+    std::vector<value_t> dst_values_h(params.n_rows * params.top_k,
+                                      std::numeric_limits<value_t>::infinity());
+    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k, static_cast<index_t>(0));
+
+    dst_values_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+    values_d.resize(nnz, stream);
+
+    update_device(dst_values_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(dst_indices_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+
+    if (params.customized_indices) {
+      customized_indices_d.resize(nnz, stream);
+      update_device(customized_indices_d.data(),
+                    customized_indices_h.data(),
+                    customized_indices_h.size(),
+                    stream);
+    }
+
+    // Let the uploads above finish before cpu_select_k overwrites dst_*_h (assumed async).
+    resource::sync_stream(handle);
+
+    // No early device copy of the values: they are uploaded with the other inputs below.
+    if (!values_h.empty()) { random_array(values_h.data(), values_h.size()); }
+
+    auto optional_indices_h = get_opt_var(customized_indices_h);
+
+    cpu_select_k(indptr_h,
+                 indices_h,
+                 values_h,
+                 optional_indices_h,
+                 params.n_rows,
+                 params.n_cols,
+                 params.top_k,
+                 dst_values_h,
+                 dst_indices_h,
+                 params.select_min);
+
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+
+    dst_values_expected_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_expected_d.resize(params.n_rows * params.top_k, stream);
+
+    update_device(values_d.data(), values_h.data(), values_h.size(), stream);
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(dst_values_expected_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(
+      dst_indices_expected_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+
+    resource::sync_stream(handle);
+  }
+
+  // Runs select_k (requesting sorted output) on the device data prepared by SetUp and
+  // compares the resulting indices and values with the CPU reference.
+  void Run()
+  {
+    auto in_val_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(),
+      indices_d.data(),
+      params.n_rows,
+      params.n_cols,
+      static_cast<index_t>(indices_d.size()));
+
+    auto in_val =
+      raft::make_device_csr_matrix_view<const value_t>(values_d.data(), in_val_structure);
+
+    std::optional<raft::device_vector_view<const index_t, index_t>> in_idx;
+    in_idx = get_opt_var(
+      raft::make_device_vector_view<const index_t, index_t>(customized_indices_d.data(), nnz));
+
+    auto out_val = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::sparse::matrix::select_k(
+      handle, in_val, in_idx, out_val, out_idx, params.select_min, true);
+
+    ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
+                                           out_idx.data_handle(),
+                                           params.n_rows * params.top_k,
+                                           raft::Compare<index_t>(),
+                                           stream));
+
+    ASSERT_TRUE(raft::devArrMatch<value_t>(dst_values_expected_d.data(),
+                                           out_val.data_handle(),
+                                           params.n_rows * params.top_k,
+                                           CompareApproxWithInf<value_t>(1e-6f),
+                                           stream));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;  // obtained from `handle`; used for all device work in this fixture
+  SelectKCsrInputs<index_t> params;
+  index_t nnz;  // number of stored matrix entries, set in SetUp
+
+  rmm::device_uvector<value_t> values_d;              // CSR values (nnz)
+  rmm::device_uvector<index_t> indptr_d;              // CSR row offsets (n_rows + 1)
+  rmm::device_uvector<index_t> indices_d;             // CSR column indices (nnz)
+  rmm::device_uvector<index_t> customized_indices_d;  // optional in_idx; empty when unused
+  rmm::device_uvector<value_t> dst_values_d;            // select_k output values
+  rmm::device_uvector<value_t> dst_values_expected_d;   // CPU reference values
+  rmm::device_uvector<index_t> dst_indices_d;           // select_k output indices
+  rmm::device_uvector<index_t> dst_indices_expected_d;  // CPU reference indices
+};
+
+using SelectKCsrTest_float_int = SelectKCsrTest<float, int>;  // float values, int indices
+TEST_P(SelectKCsrTest_float_int, Result) { Run(); }
+// Same check with 64-bit types: double values and int64_t indices.
+using SelectKCsrTest_double_int64 = SelectKCsrTest<double, int64_t>;
+TEST_P(SelectKCsrTest_double_int64, Result) { Run(); }
+
+template <typename index_t>  // {n_rows, n_cols, top_k, sparsity, select_min, customized_indices}
+const std::vector<SelectKCsrInputs<index_t>> selectk_inputs = {
+  {10, 32, 10, 0.0, true, false},
+  {10, 32, 10, 0.0, true, true},
+  {10, 32, 10, 0.01, true, false},  // kWarpImmediate
+  {10, 32, 10, 0.1, true, true},
+  {10, 32, 251, 0.1, true, false},  // kWarpImmediate
+  {10, 32, 251, 0.6, true, true},
+  {1000, 1024 * 100, 1, 0.1, true, false},  // kWarpImmediate
+  {1000, 1024 * 100, 1, 0.2, true, true},
+  {1024, 1024, 258, 0.3, true, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 600, 0.2, true, true},
+  {1024, 1024, 1024, 0.3, true, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 1024, 0.2, true, true},
+  {100, 1024 * 1000, 251, 0.1, true, false},  // kWarpDistributedShm
+  {100, 1024 * 1000, 251, 0.2, true, true},
+  {1024, 1024 * 10, 251, 0.3, true, false},  // kWarpImmediate
+  {1024, 1024 * 10, 251, 0.2, true, true},
+  {1000, 1024 * 20, 1000, 0.2, true, false},  // kRadix11bits
+  {1000, 1024 * 20, 1000, 0.3, true, true},
+  {2048, 1024 * 10, 1000, 0.2, true, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 1000, 0.3, true, true},
+  {2048, 1024 * 10, 2100, 0.1, true, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 2100, 0.2, true, true},
+  {10, 32, 10, 0.0, false, false},
+  {10, 32, 10, 0.0, false, true},
+  {10, 32, 10, 0.01, false, false},  // kWarpImmediate
+  {10, 32, 10, 0.1, false, true},
+  {10, 32, 251, 0.1, false, false},  // kWarpImmediate
+  {10, 32, 251, 0.6, false, true},
+  {1000, 1024 * 100, 1, 0.1, false, false},  // kWarpImmediate
+  {1000, 1024 * 100, 1, 0.2, false, true},
+  {1024, 1024, 258, 0.3, false, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 600, 0.2, false, true},
+  {1024, 1024, 1024, 0.3, false, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 1024, 0.2, false, true},
+  {100, 1024 * 1000, 251, 0.1, false, false},  // kWarpDistributedShm
+  {100, 1024 * 1000, 251, 0.2, false, true},
+  {1024, 1024 * 10, 251, 0.3, false, false},  // kWarpImmediate
+  {1024, 1024 * 10, 251, 0.2, false, true},
+  {1000, 1024 * 20, 1000, 0.2, false, false},  // kRadix11bits
+  {1000, 1024 * 20, 1000, 0.3, false, true},
+  {2048, 1024 * 10, 1000, 0.2, false, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 1000, 0.3, false, true},
+  {2048, 1024 * 10, 2100, 0.1, false, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 2100, 0.2, false, true}};
+
+INSTANTIATE_TEST_CASE_P(SelectKCsrTest,  // legacy spelling of INSTANTIATE_TEST_SUITE_P
+                        SelectKCsrTest_float_int,
+                        ::testing::ValuesIn(selectk_inputs<int>));  // index_t = int
+INSTANTIATE_TEST_CASE_P(SelectKCsrTest,
+                        SelectKCsrTest_double_int64,
+                        ::testing::ValuesIn(selectk_inputs<int64_t>));  // index_t = int64_t
+
+}  // namespace sparse
+}  // namespace raft