Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PoC] select_k: Replace specialization by split header #1434

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,12 @@ if(RAFT_COMPILE_LIBRARY)
src/cluster/update_centroids_double.cu
src/cluster/cluster_cost_float.cu
src/cluster/cluster_cost_double.cu
src/matrix/detail/select_k_double_int64_t.cu
src/matrix/detail/select_k_double_uint32_t.cu
src/matrix/detail/select_k_float_int64_t.cu
src/matrix/detail/select_k_float_uint32_t.cu
src/matrix/detail/select_k_half_int64_t.cu
src/matrix/detail/select_k_half_uint32_t.cu
src/neighbors/refine_d_int64_t_float.cu
src/neighbors/refine_d_int64_t_int8_t.cu
src/neighbors/refine_d_int64_t_uint8_t.cu
Expand Down Expand Up @@ -327,11 +333,6 @@ if(RAFT_COMPILE_LIBRARY)
src/distance/specializations/fused_l2_nn_double_int64.cu
src/distance/specializations/fused_l2_nn_float_int.cu
src/distance/specializations/fused_l2_nn_float_int64.cu
src/matrix/select_k_float_int64_t.cu
src/matrix/specializations/detail/select_k_float_uint32_t.cu
src/matrix/specializations/detail/select_k_float_int64_t.cu
src/matrix/specializations/detail/select_k_half_uint32_t.cu
src/matrix/specializations/detail/select_k_half_int64_t.cu
src/neighbors/ivfpq_build.cu
src/neighbors/ivfpq_deserialize.cu
src/neighbors/ivfpq_serialize.cu
Expand Down Expand Up @@ -434,6 +435,7 @@ if(RAFT_COMPILE_LIBRARY)
src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu
src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu
src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu
src/raft_runtime/matrix/select_k_float_int64_t.cu
src/random/rmat_rectangular_generator_int_double.cu
src/random/rmat_rectangular_generator_int64_double.cu
src/random/rmat_rectangular_generator_int_float.cu
Expand Down Expand Up @@ -463,7 +465,13 @@ if(RAFT_COMPILE_LIBRARY)
raft_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
)
target_compile_definitions(raft_lib INTERFACE "RAFT_COMPILED")

# RAFT_COMPILED is set during compilation of libraft.so as well as downstream
# libraries (due to "PUBLIC")
target_compile_definitions(raft_lib PUBLIC "RAFT_COMPILED")
# RAFT_EXPLICIT_INSTANTIATE_ONLY is set during compilation of libraft.so (due
# to "PRIVATE")
target_compile_definitions(raft_lib PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")

# ensure CUDA symbols aren't relocated to the middle of the debug build binaries
target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
Expand Down
1 change: 1 addition & 0 deletions cpp/bench/prims/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ function(ConfigureBench)
${BENCH_NAME} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
)
target_compile_definitions(${BENCH_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")

target_include_directories(
${BENCH_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/bench/prims>"
Expand Down
65 changes: 65 additions & 0 deletions cpp/include/raft/matrix/detail/select_k-ext.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstdint> // uint32_t
#include <cuda_fp16.h> // __half
#include <raft/util/raft_explicit.hpp> // RAFT_EXPLICIT
#include <rmm/cuda_stream_view.hpp> // rmm::cuda_stream_view
#include <rmm/mr/device/device_memory_resource.hpp> // rmm::mr::device_memory_resource

#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY

namespace raft::matrix::detail {

// Declaration-only stub of select_k, visible when the translation unit opts
// into explicit-instantiation-only mode. RAFT_EXPLICIT (from
// raft/util/raft_explicit.hpp) presumably prevents implicit instantiation, so
// callers must link against one of the extern template instantiations listed
// below — TODO(review): confirm exact RAFT_EXPLICIT semantics in that header.
template <typename T, typename IdxT>
void select_k(const T* in_val,
const IdxT* in_idx,
size_t batch_size,
size_t len,
int k,
T* out_val,
IdxT* out_idx,
bool select_min,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
} // namespace raft::matrix::detail

#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY

// Declares an explicit instantiation ("extern template") of select_k for one
// (T, IdxT) pair. The matching definitions live in the
// src/matrix/detail/select_k_*.cu translation units compiled into libraft.
#define instantiate_raft_matrix_detail_select_k(T, IdxT) \
extern template void raft::matrix::detail::select_k(const T* in_val, \
const IdxT* in_idx, \
size_t batch_size, \
size_t len, \
int k, \
T* out_val, \
IdxT* out_idx, \
bool select_min, \
rmm::cuda_stream_view stream, \
rmm::mr::device_memory_resource* mr)

instantiate_raft_matrix_detail_select_k(__half, uint32_t);
instantiate_raft_matrix_detail_select_k(__half, int64_t);
instantiate_raft_matrix_detail_select_k(float, int64_t);
instantiate_raft_matrix_detail_select_k(float, uint32_t);
// We did not have these two for double before, but there are tests for them. We
// therefore include them here.
instantiate_raft_matrix_detail_select_k(double, int64_t);
instantiate_raft_matrix_detail_select_k(double, uint32_t);

// Undefine so the macro does not leak to includers of this header.
#undef instantiate_raft_matrix_detail_select_k
91 changes: 91 additions & 0 deletions cpp/include/raft/matrix/detail/select_k-inl.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "select_radix.cuh"
#include "select_warpsort.cuh"

#include <raft/core/nvtx.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace raft::matrix::detail {

/**
 * Select the k smallest or largest key/value pairs from every row of the input.
 *
 * Treating `in_val` as a row-major matrix of shape (batch_size, len), this
 * routine picks the `k` smallest (or largest) values per row and writes them,
 * together with their payload indices, into the row-major output matrices of
 * shape (batch_size, k).
 *
 * @tparam T
 *   type of the keys (the values being compared).
 * @tparam IdxT
 *   type of the payload indices selected alongside the keys.
 *
 * @param[in] in_val
 *   contiguous device array of size (len * batch_size) holding the keys.
 * @param[in] in_idx
 *   contiguous device array of size (len * batch_size); typically the indices
 *   corresponding to `in_val`.
 * @param batch_size
 *   number of input rows (the batch size).
 * @param len
 *   length of one input row (also known as n_cols). Invariant: len >= k.
 * @param k
 *   number of entries to select from each row.
 * @param[out] out_val
 *   contiguous device array of size (k * batch_size) receiving the selected
 *   keys of each row.
 * @param[out] out_idx
 *   contiguous device array of size (k * batch_size) receiving the payload
 *   selected together with `out_val`.
 * @param select_min
 *   true to select the k smallest keys, false to select the k largest.
 * @param stream
 * @param mr an optional memory resource used across the calls (supplying a
 *   sufficiently large memory pool avoids allocations inside the call).
 */
template <typename T, typename IdxT>
void select_k(const T* in_val,
              const IdxT* in_idx,
              size_t batch_size,
              size_t len,
              int k,
              T* out_val,
              IdxT* out_idx,
              bool select_min,
              rmm::cuda_stream_view stream,
              rmm::mr::device_memory_resource* mr = nullptr)
{
  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
    "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
  // Heuristic dispatch between the two kernels: radix is preferred for large
  // batched problems even when k would fit the warpsort capacity.
  // TODO (achirkin): investigate the trade-off for a wider variety of inputs.
  const bool prefer_radix  = batch_size >= 64 && len >= 102400 && k >= 128;
  const bool use_warpsort  = (k <= select::warpsort::kMaxCapacity) && !prefer_radix;
  if (use_warpsort) {
    select::warpsort::select_k<T, IdxT>(
      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
    return;
  }
  // Radix path: bucket width depends on the key size (11 bits for >=4-byte
  // keys, 8 bits otherwise), with a 512-thread block configuration.
  select::radix::select_k<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
    in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, true, stream, mr);
}

} // namespace raft::matrix::detail
78 changes: 6 additions & 72 deletions cpp/include/raft/matrix/detail/select_k.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,76 +16,10 @@

#pragma once

#include "select_radix.cuh"
#include "select_warpsort.cuh"
#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
#include "select_k-inl.cuh"
#endif

#include <raft/core/nvtx.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace raft::matrix::detail {

/**
* Select k smallest or largest key/values from each row in the input data.
*
* If you think of the input data `in_val` as a row-major matrix with `len` columns and
* `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
* in the row-major matrix `out_val` of size (batch_size, k).
*
* @tparam T
* the type of the keys (what is being compared).
* @tparam IdxT
* the index type (what is being selected together with the keys).
*
* @param[in] in_val
* contiguous device array of inputs of size (len * batch_size);
* these are compared and selected.
* @param[in] in_idx
* contiguous device array of inputs of size (len * batch_size);
* typically, these are indices of the corresponding in_val.
* @param batch_size
* number of input rows, i.e. the batch size.
* @param len
* length of a single input array (row); also sometimes referred as n_cols.
* Invariant: len >= k.
* @param k
* the number of outputs to select in each input row.
* @param[out] out_val
* contiguous device array of outputs of size (k * batch_size);
* the k smallest/largest values from each row of the `in_val`.
* @param[out] out_idx
* contiguous device array of outputs of size (k * batch_size);
* the payload selected together with `out_val`.
* @param select_min
* whether to select k smallest (true) or largest (false) keys.
* @param stream
* @param mr an optional memory resource to use across the calls (you can provide a large enough
* memory pool here to avoid memory allocations within the call).
*/
template <typename T, typename IdxT>
void select_k(const T* in_val,
const IdxT* in_idx,
size_t batch_size,
size_t len,
int k,
T* out_val,
IdxT* out_idx,
bool select_min,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = nullptr)
{
common::nvtx::range<common::nvtx::domain::raft> fun_scope(
"matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
// TODO (achirkin): investigate the trade-off for a wider variety of inputs.
const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128;
if (k <= select::warpsort::kMaxCapacity && !radix_faster) {
select::warpsort::select_k<T, IdxT>(
in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
} else {
select::radix::select_k<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, true, stream, mr);
}
}

} // namespace raft::matrix::detail
#ifdef RAFT_COMPILED
#include "select_k-ext.cuh"
#endif
34 changes: 5 additions & 29 deletions cpp/include/raft/matrix/specializations/detail/select_k.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,8 @@

#pragma once

#include <raft/matrix/detail/select_k.cuh>

#include <cuda_fp16.h>

namespace raft::matrix::detail {

#define RAFT_INST(T, IdxT) \
extern template void select_k<T, IdxT>(const T*, \
const IdxT*, \
size_t, \
size_t, \
int, \
T*, \
IdxT*, \
bool, \
rmm::cuda_stream_view, \
rmm::mr::device_memory_resource*);

// Commonly used types
RAFT_INST(float, int64_t);
RAFT_INST(half, int64_t);

// These instances are used in the ivf_pq::search parameterized by the internal_distance_dtype
RAFT_INST(float, uint32_t);
RAFT_INST(half, uint32_t);

#undef RAFT_INST

} // namespace raft::matrix::detail
#pragma message( \
__FILE__ \
" is deprecated and will be removed." \
" Including specializations is not necessary any more." \
" For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
Loading