Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PoC] select_k: Replace specialization by split header #1434

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,12 @@ if(RAFT_COMPILE_LIBRARY)
src/cluster/update_centroids_double.cu
src/cluster/cluster_cost_float.cu
src/cluster/cluster_cost_double.cu
src/matrix/detail/select_k_double_int64_t.cu
src/matrix/detail/select_k_double_uint32_t.cu
src/matrix/detail/select_k_float_int64_t.cu
src/matrix/detail/select_k_float_uint32_t.cu
src/matrix/detail/select_k_half_int64_t.cu
src/matrix/detail/select_k_half_uint32_t.cu
src/neighbors/refine_d_int64_t_float.cu
src/neighbors/refine_d_int64_t_int8_t.cu
src/neighbors/refine_d_int64_t_uint8_t.cu
Expand Down Expand Up @@ -327,11 +333,6 @@ if(RAFT_COMPILE_LIBRARY)
src/distance/specializations/fused_l2_nn_double_int64.cu
src/distance/specializations/fused_l2_nn_float_int.cu
src/distance/specializations/fused_l2_nn_float_int64.cu
src/matrix/select_k_float_int64_t.cu
src/matrix/specializations/detail/select_k_float_uint32_t.cu
src/matrix/specializations/detail/select_k_float_int64_t.cu
src/matrix/specializations/detail/select_k_half_uint32_t.cu
src/matrix/specializations/detail/select_k_half_int64_t.cu
src/neighbors/ivfpq_build.cu
src/neighbors/ivfpq_deserialize.cu
src/neighbors/ivfpq_serialize.cu
Expand Down Expand Up @@ -434,6 +435,7 @@ if(RAFT_COMPILE_LIBRARY)
src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu
src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu
src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu
src/raft_runtime/matrix/select_k_float_int64_t.cu
src/random/rmat_rectangular_generator_int_double.cu
src/random/rmat_rectangular_generator_int64_double.cu
src/random/rmat_rectangular_generator_int_float.cu
Expand Down Expand Up @@ -463,7 +465,13 @@ if(RAFT_COMPILE_LIBRARY)
raft_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
)
target_compile_definitions(raft_lib INTERFACE "RAFT_COMPILED")

# RAFT_COMPILED is set during compilation of libraft.so as well as downstream
# libraries (due to "PUBLIC")
target_compile_definitions(raft_lib PUBLIC "RAFT_COMPILED")
# RAFT_EXPLICIT_INSTANTIATE_ONLY is set during compilation of libraft.so (due
# to "PRIVATE")
target_compile_definitions(raft_lib PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")

# ensure CUDA symbols aren't relocated to the middle of the debug build binaries
target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
Expand Down
1 change: 1 addition & 0 deletions cpp/bench/prims/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ function(ConfigureBench)
${BENCH_NAME} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
)
target_compile_definitions(${BENCH_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")

target_include_directories(
${BENCH_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/bench/prims>"
Expand Down
65 changes: 65 additions & 0 deletions cpp/include/raft/matrix/detail/select_k-ext.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstdint> // uint32_t
#include <cuda_fp16.h> // __half
#include <raft/util/raft_explicit.hpp> // RAFT_EXPLICIT
#include <rmm/cuda_stream_view.hpp> // rmm::cuda_stream_view
#include <rmm/mr/device/device_memory_resource.hpp> // rmm::mr::device_memory_resource

#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY

namespace raft::matrix::detail {

// Declaration-only stub of select_k, visible when the translation unit opts
// into explicit-instantiation-only mode. RAFT_EXPLICIT (from
// raft/util/raft_explicit.hpp) presumably prevents implicit instantiation, so
// callers must link against one of the extern template instantiations listed
// below — TODO(review): confirm exact RAFT_EXPLICIT semantics in that header.
template <typename T, typename IdxT>
void select_k(const T* in_val,
const IdxT* in_idx,
size_t batch_size,
size_t len,
int k,
T* out_val,
IdxT* out_idx,
bool select_min,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
} // namespace raft::matrix::detail

#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY

// Declares an explicit instantiation ("extern template") of select_k for one
// (T, IdxT) pair. The matching definitions live in the
// src/matrix/detail/select_k_*.cu translation units compiled into libraft.
#define instantiate_raft_matrix_detail_select_k(T, IdxT) \
extern template void raft::matrix::detail::select_k(const T* in_val, \
const IdxT* in_idx, \
size_t batch_size, \
size_t len, \
int k, \
T* out_val, \
IdxT* out_idx, \
bool select_min, \
rmm::cuda_stream_view stream, \
rmm::mr::device_memory_resource* mr)

instantiate_raft_matrix_detail_select_k(__half, uint32_t);
instantiate_raft_matrix_detail_select_k(__half, int64_t);
instantiate_raft_matrix_detail_select_k(float, int64_t);
instantiate_raft_matrix_detail_select_k(float, uint32_t);
// We did not have these two for double before, but there are tests for them. We
// therefore include them here.
instantiate_raft_matrix_detail_select_k(double, int64_t);
instantiate_raft_matrix_detail_select_k(double, uint32_t);

// Undefine so the macro does not leak to includers of this header.
#undef instantiate_raft_matrix_detail_select_k
91 changes: 91 additions & 0 deletions cpp/include/raft/matrix/detail/select_k-inl.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "select_radix.cuh"
#include "select_warpsort.cuh"

#include <raft/core/nvtx.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace raft::matrix::detail {

/**
 * Select the k smallest or largest key/value pairs from every row of the input.
 *
 * Treating `in_val` as a row-major matrix of shape (batch_size, len), this
 * routine picks the `k` smallest (or largest) values per row and writes them,
 * together with their payload indices, into the row-major output matrices of
 * shape (batch_size, k).
 *
 * @tparam T
 *   type of the keys (the values being compared).
 * @tparam IdxT
 *   type of the payload indices selected alongside the keys.
 *
 * @param[in] in_val
 *   contiguous device array of size (len * batch_size) holding the keys.
 * @param[in] in_idx
 *   contiguous device array of size (len * batch_size); typically the indices
 *   corresponding to `in_val`.
 * @param batch_size
 *   number of input rows (the batch size).
 * @param len
 *   length of one input row (also known as n_cols). Invariant: len >= k.
 * @param k
 *   number of entries to select from each row.
 * @param[out] out_val
 *   contiguous device array of size (k * batch_size) receiving the selected
 *   keys of each row.
 * @param[out] out_idx
 *   contiguous device array of size (k * batch_size) receiving the payload
 *   selected together with `out_val`.
 * @param select_min
 *   true to select the k smallest keys, false to select the k largest.
 * @param stream
 * @param mr an optional memory resource used across the calls (supplying a
 *   sufficiently large memory pool avoids allocations inside the call).
 */
template <typename T, typename IdxT>
void select_k(const T* in_val,
              const IdxT* in_idx,
              size_t batch_size,
              size_t len,
              int k,
              T* out_val,
              IdxT* out_idx,
              bool select_min,
              rmm::cuda_stream_view stream,
              rmm::mr::device_memory_resource* mr = nullptr)
{
  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
    "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
  // Heuristic dispatch between the two kernels: radix is preferred for large
  // batched problems even when k would fit the warpsort capacity.
  // TODO (achirkin): investigate the trade-off for a wider variety of inputs.
  const bool prefer_radix  = batch_size >= 64 && len >= 102400 && k >= 128;
  const bool use_warpsort  = (k <= select::warpsort::kMaxCapacity) && !prefer_radix;
  if (use_warpsort) {
    select::warpsort::select_k<T, IdxT>(
      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
    return;
  }
  // Radix path: bucket width depends on the key size (11 bits for >=4-byte
  // keys, 8 bits otherwise), with a 512-thread block configuration.
  select::radix::select_k<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
    in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, true, stream, mr);
}

} // namespace raft::matrix::detail
78 changes: 6 additions & 72 deletions cpp/include/raft/matrix/detail/select_k.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,76 +16,10 @@

#pragma once

#include "select_radix.cuh"
#include "select_warpsort.cuh"
#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
#include "select_k-inl.cuh"
#endif

#include <raft/core/nvtx.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace raft::matrix::detail {

/**
* Select k smallest or largest key/values from each row in the input data.
*
* If you think of the input data `in_val` as a row-major matrix with `len` columns and
* `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
* in the row-major matrix `out_val` of size (batch_size, k).
*
* @tparam T
* the type of the keys (what is being compared).
* @tparam IdxT
* the index type (what is being selected together with the keys).
*
* @param[in] in_val
* contiguous device array of inputs of size (len * batch_size);
* these are compared and selected.
* @param[in] in_idx
* contiguous device array of inputs of size (len * batch_size);
* typically, these are indices of the corresponding in_val.
* @param batch_size
* number of input rows, i.e. the batch size.
* @param len
* length of a single input array (row); also sometimes referred as n_cols.
* Invariant: len >= k.
* @param k
* the number of outputs to select in each input row.
* @param[out] out_val
* contiguous device array of outputs of size (k * batch_size);
* the k smallest/largest values from each row of the `in_val`.
* @param[out] out_idx
* contiguous device array of outputs of size (k * batch_size);
* the payload selected together with `out_val`.
* @param select_min
* whether to select k smallest (true) or largest (false) keys.
* @param stream
* @param mr an optional memory resource to use across the calls (you can provide a large enough
* memory pool here to avoid memory allocations within the call).
*/
template <typename T, typename IdxT>
void select_k(const T* in_val,
const IdxT* in_idx,
size_t batch_size,
size_t len,
int k,
T* out_val,
IdxT* out_idx,
bool select_min,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = nullptr)
{
common::nvtx::range<common::nvtx::domain::raft> fun_scope(
"matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
// TODO (achirkin): investigate the trade-off for a wider variety of inputs.
const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128;
if (k <= select::warpsort::kMaxCapacity && !radix_faster) {
select::warpsort::select_k<T, IdxT>(
in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
} else {
select::radix::select_k<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, true, stream, mr);
}
}

} // namespace raft::matrix::detail
#ifdef RAFT_COMPILED
#include "select_k-ext.cuh"
#endif
34 changes: 5 additions & 29 deletions cpp/include/raft/matrix/specializations/detail/select_k.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,8 @@

#pragma once

#include <raft/matrix/detail/select_k.cuh>

#include <cuda_fp16.h>

namespace raft::matrix::detail {

#define RAFT_INST(T, IdxT) \
extern template void select_k<T, IdxT>(const T*, \
const IdxT*, \
size_t, \
size_t, \
int, \
T*, \
IdxT*, \
bool, \
rmm::cuda_stream_view, \
rmm::mr::device_memory_resource*);

// Commonly used types
RAFT_INST(float, int64_t);
RAFT_INST(half, int64_t);

// These instances are used in the ivf_pq::search parameterized by the internal_distance_dtype
RAFT_INST(float, uint32_t);
RAFT_INST(half, uint32_t);

#undef RAFT_INST

} // namespace raft::matrix::detail
#pragma message( \
__FILE__ \
" is deprecated and will be removed." \
" Including specializations is not necessary any more." \
" For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
Loading