Skip to content

Commit

Permalink
Add explicit template instantiations for CAGRA
Browse files Browse the repository at this point in the history
  • Loading branch information
tfeher committed Jul 17, 2023
1 parent 700112f commit 5e74e7e
Show file tree
Hide file tree
Showing 42 changed files with 2,530 additions and 1 deletion.
24 changes: 24 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,30 @@ if(RAFT_COMPILE_LIBRARY)
src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
src/neighbors/brute_force_knn_int_float_int.cu
src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu
src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu
src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu
src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu
src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu
src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <raft/util/raft_explicit.hpp> // RAFT_EXPLICIT

namespace raft::neighbors::experimental::cagra::detail {
namespace multi_cta_search {

#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY

/**
 * @brief Declaration of the multi-CTA `select_and_run` entry point.
 *
 * This declaration is only compiled when RAFT_EXPLICIT_INSTANTIATE_ONLY is
 * defined. RAFT_EXPLICIT (from <raft/util/raft_explicit.hpp>) is appended to
 * the declarator; see that header for what it expands to.
 *
 * @tparam TEAM_SIZE        paired with MAX_DATASET_DIM in the extern list below
 *                          as (8, 128), (16, 256), (32, 512) and (32, 1024)
 * @tparam MAX_DATASET_DIM  see TEAM_SIZE
 * @tparam DATA_T           element type of `dataset` and `queries_ptr`
 * @tparam INDEX_T          element type of `graph`, `topk_indices_ptr`,
 *                          `dev_seed_ptr` and `hashmap_ptr`
 * @tparam DISTANCE_T       element type of `topk_distances_ptr`
 */
template <unsigned TEAM_SIZE,
          unsigned MAX_DATASET_DIM,
          typename DATA_T,
          typename INDEX_T,
          typename DISTANCE_T>
void select_and_run(raft::device_matrix_view<const DATA_T, INDEX_T, layout_stride> dataset,
                    raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,
                    INDEX_T* const topk_indices_ptr,
                    DISTANCE_T* const topk_distances_ptr,
                    const DATA_T* const queries_ptr,
                    const uint32_t num_queries,
                    const INDEX_T* dev_seed_ptr,
                    uint32_t* const num_executed_iterations,
                    uint32_t topk,
                    uint32_t block_size,
                    uint32_t result_buffer_size,
                    uint32_t smem_size,
                    int64_t hash_bitlen,
                    INDEX_T* hashmap_ptr,
                    uint32_t num_cta_per_query,
                    uint32_t num_random_samplings,
                    uint64_t rand_xor_mask,
                    uint32_t num_seeds,
                    size_t itopk_size,
                    size_t num_parents,
                    size_t min_iterations,
                    size_t max_iterations,
                    cudaStream_t stream) RAFT_EXPLICIT;

#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY

// Emits a single `extern template` declaration of select_and_run, which tells
// the including translation unit not to instantiate that specialization itself.
#define RAFT_CAGRA_MULTI_CTA_EXTERN(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)     \
  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(  \
    raft::device_matrix_view<const DATA_T, INDEX_T, layout_stride> dataset,                      \
    raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,                           \
    INDEX_T* const topk_indices_ptr,                                                             \
    DISTANCE_T* const topk_distances_ptr,                                                        \
    const DATA_T* const queries_ptr,                                                             \
    const uint32_t num_queries,                                                                  \
    const INDEX_T* dev_seed_ptr,                                                                 \
    uint32_t* const num_executed_iterations,                                                     \
    uint32_t topk,                                                                               \
    uint32_t block_size,                                                                         \
    uint32_t result_buffer_size,                                                                 \
    uint32_t smem_size,                                                                          \
    int64_t hash_bitlen,                                                                         \
    INDEX_T* hashmap_ptr,                                                                        \
    uint32_t num_cta_per_query,                                                                  \
    uint32_t num_random_samplings,                                                               \
    uint64_t rand_xor_mask,                                                                      \
    uint32_t num_seeds,                                                                          \
    size_t itopk_size,                                                                           \
    size_t num_parents,                                                                          \
    size_t min_iterations,                                                                       \
    size_t max_iterations,                                                                       \
    cudaStream_t stream);

// Expands to the four (TEAM_SIZE, MAX_DATASET_DIM) configurations for one
// (DATA_T, INDEX_T, DISTANCE_T) triple, so the table is written only once.
#define RAFT_CAGRA_MULTI_CTA_EXTERN_ALL_DIMS(DATA_T, INDEX_T, DISTANCE_T) \
  RAFT_CAGRA_MULTI_CTA_EXTERN(8, 128, DATA_T, INDEX_T, DISTANCE_T)        \
  RAFT_CAGRA_MULTI_CTA_EXTERN(16, 256, DATA_T, INDEX_T, DISTANCE_T)       \
  RAFT_CAGRA_MULTI_CTA_EXTERN(32, 512, DATA_T, INDEX_T, DISTANCE_T)       \
  RAFT_CAGRA_MULTI_CTA_EXTERN(32, 1024, DATA_T, INDEX_T, DISTANCE_T)

RAFT_CAGRA_MULTI_CTA_EXTERN_ALL_DIMS(float, uint32_t, float);
RAFT_CAGRA_MULTI_CTA_EXTERN_ALL_DIMS(int8_t, uint32_t, float);
RAFT_CAGRA_MULTI_CTA_EXTERN_ALL_DIMS(uint8_t, uint32_t, float);

#undef RAFT_CAGRA_MULTI_CTA_EXTERN_ALL_DIMS
#undef RAFT_CAGRA_MULTI_CTA_EXTERN

}  // namespace multi_cta_search
}  // namespace raft::neighbors::experimental::cagra::detail
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,10 @@
*/
#pragma once

// Pull in the template definitions unless the build requested
// explicit-instantiation-only mode.
#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
#include "search_multi_cta_kernel-inl.cuh"
#endif

// NOTE(review): RAFT_COMPILED is presumably defined when building against the
// compiled RAFT library; in that case the -ext header is included as well --
// confirm against the build configuration.
#ifdef RAFT_COMPILED
#include "search_multi_cta_kernel-ext.cuh"
#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <raft/util/raft_explicit.hpp> // RAFT_EXPLICIT
namespace raft::neighbors::experimental::cagra::detail {
namespace single_cta_search {

#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY

/**
 * @brief Declaration of the single-CTA `select_and_run` entry point.
 *
 * This declaration is only compiled when RAFT_EXPLICIT_INSTANTIATE_ONLY is
 * defined. RAFT_EXPLICIT (from <raft/util/raft_explicit.hpp>) is appended to
 * the declarator; see that header for what it expands to.
 *
 * @tparam TEAM_SIZE        paired with MAX_DATASET_DIM in the extern list below
 *                          as (8, 128), (16, 256), (32, 512) and (32, 1024)
 * @tparam MAX_DATASET_DIM  see TEAM_SIZE
 * @tparam DATA_T           element type of `dataset` and `queries_ptr`
 * @tparam INDEX_T          element type of `graph`, `topk_indices_ptr`,
 *                          `dev_seed_ptr` and `hashmap_ptr`
 * @tparam DISTANCE_T       element type of `topk_distances_ptr`
 */
template <unsigned TEAM_SIZE,
          unsigned MAX_DATASET_DIM,
          class DATA_T,
          class INDEX_T,
          class DISTANCE_T>
void select_and_run(  // raft::resources const& res,
  raft::device_matrix_view<const DATA_T, INDEX_T, layout_stride> dataset,
  raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,
  INDEX_T* const topk_indices_ptr,          // [num_queries, topk]
  DISTANCE_T* const topk_distances_ptr,     // [num_queries, topk]
  const DATA_T* const queries_ptr,          // [num_queries, dataset_dim]
  const uint32_t num_queries,
  const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
  uint32_t* const num_executed_iterations,  // [num_queries,]
  uint32_t topk,
  uint32_t num_itopk_candidates,
  uint32_t block_size,
  uint32_t smem_size,
  int64_t hash_bitlen,
  INDEX_T* hashmap_ptr,
  size_t small_hash_bitlen,
  size_t small_hash_reset_interval,
  uint32_t num_random_samplings,
  uint64_t rand_xor_mask,
  uint32_t num_seeds,
  size_t itopk_size,
  size_t num_parents,
  size_t min_iterations,
  size_t max_iterations,
  cudaStream_t stream) RAFT_EXPLICIT;

#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY

// Emits a single `extern template` declaration of select_and_run, which tells
// the including translation unit not to instantiate that specialization itself.
#define RAFT_CAGRA_SINGLE_CTA_EXTERN(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T)    \
  extern template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>(  \
    raft::device_matrix_view<const DATA_T, INDEX_T, layout_stride> dataset,                      \
    raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,                           \
    INDEX_T* const topk_indices_ptr,                                                             \
    DISTANCE_T* const topk_distances_ptr,                                                        \
    const DATA_T* const queries_ptr,                                                             \
    const uint32_t num_queries,                                                                  \
    const INDEX_T* dev_seed_ptr,                                                                 \
    uint32_t* const num_executed_iterations,                                                     \
    uint32_t topk,                                                                               \
    uint32_t num_itopk_candidates,                                                               \
    uint32_t block_size,                                                                         \
    uint32_t smem_size,                                                                          \
    int64_t hash_bitlen,                                                                         \
    INDEX_T* hashmap_ptr,                                                                        \
    size_t small_hash_bitlen,                                                                    \
    size_t small_hash_reset_interval,                                                            \
    uint32_t num_random_samplings,                                                               \
    uint64_t rand_xor_mask,                                                                      \
    uint32_t num_seeds,                                                                          \
    size_t itopk_size,                                                                           \
    size_t num_parents,                                                                          \
    size_t min_iterations,                                                                       \
    size_t max_iterations,                                                                       \
    cudaStream_t stream);

// Expands to the four (TEAM_SIZE, MAX_DATASET_DIM) configurations for one
// (DATA_T, INDEX_T, DISTANCE_T) triple, so the table is written only once.
#define RAFT_CAGRA_SINGLE_CTA_EXTERN_ALL_DIMS(DATA_T, INDEX_T, DISTANCE_T) \
  RAFT_CAGRA_SINGLE_CTA_EXTERN(8, 128, DATA_T, INDEX_T, DISTANCE_T)        \
  RAFT_CAGRA_SINGLE_CTA_EXTERN(16, 256, DATA_T, INDEX_T, DISTANCE_T)       \
  RAFT_CAGRA_SINGLE_CTA_EXTERN(32, 512, DATA_T, INDEX_T, DISTANCE_T)       \
  RAFT_CAGRA_SINGLE_CTA_EXTERN(32, 1024, DATA_T, INDEX_T, DISTANCE_T)

RAFT_CAGRA_SINGLE_CTA_EXTERN_ALL_DIMS(float, uint32_t, float);
RAFT_CAGRA_SINGLE_CTA_EXTERN_ALL_DIMS(int8_t, uint32_t, float);
RAFT_CAGRA_SINGLE_CTA_EXTERN_ALL_DIMS(uint8_t, uint32_t, float);

#undef RAFT_CAGRA_SINGLE_CTA_EXTERN_ALL_DIMS
#undef RAFT_CAGRA_SINGLE_CTA_EXTERN

}  // namespace single_cta_search
}  // namespace raft::neighbors::experimental::cagra::detail
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,10 @@
*/
#pragma once

// Pull in the template definitions unless the build requested
// explicit-instantiation-only mode.
#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
#include "search_single_cta_kernel-inl.cuh"
#endif

// NOTE(review): RAFT_COMPILED is presumably defined when building against the
// compiled RAFT library; in that case the -ext header (extern template
// declarations of select_and_run) is included as well -- confirm against the
// build configuration.
#ifdef RAFT_COMPILED
#include "search_single_cta_kernel-ext.cuh"
#endif
101 changes: 101 additions & 0 deletions cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Text written at the top of every generated .cu file: license, "generated
# file" notice, the -inl include, the namespace opener and the definition of
# the explicit-instantiation macro.
header = """
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* NOTE: this file is generated by search_multi_cta_00_generate.py
*
* Make changes there and run in this directory:
*
* > python search_multi_cta_00_generate.py
*
*/
#include <raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh>
namespace raft::neighbors::experimental::cagra::detail::multi_cta_search {
#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \\
template void select_and_run<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>( \\
raft::device_matrix_view<const DATA_T, INDEX_T, layout_stride> dataset, \\
raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph, \\
INDEX_T* const topk_indices_ptr, \\
DISTANCE_T* const topk_distances_ptr, \\
const DATA_T* const queries_ptr, \\
const uint32_t num_queries, \\
const INDEX_T* dev_seed_ptr, \\
uint32_t* const num_executed_iterations, \\
uint32_t topk, \\
uint32_t block_size, \\
uint32_t result_buffer_size, \\
uint32_t smem_size, \\
int64_t hash_bitlen, \\
INDEX_T* hashmap_ptr, \\
uint32_t num_cta_per_query, \\
uint32_t num_random_samplings, \\
uint64_t rand_xor_mask, \\
uint32_t num_seeds, \\
size_t itopk_size, \\
size_t num_parents, \\
size_t min_iterations, \\
size_t max_iterations, \\
cudaStream_t stream);
"""

# Text written at the bottom of every generated .cu file. The closing comment
# names the namespace opened in `header` (the previous text contained a stray
# "namespace" keyword in the middle of the qualified name).
trailer = """
#undef instantiate_kernel_selection
} // namespace raft::neighbors::experimental::cagra::detail::multi_cta_search
"""

# (MAX_DATASET_DIM, TEAM_SIZE) pairs; one .cu file is emitted per pair and type.
mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)]

# File-name tag -> (data_t, idx_t, distance_t) template arguments.
# NOTE(review): float_uint64 sources are generated here, but the CMake source
# list and the extern template declarations that accompany this change only
# cover the uint32_t index variants -- confirm whether these files are meant to
# be added to the build.
search_types = dict(
    float_uint32=("float", "uint32_t", "float"),
    int8_uint32=("int8_t", "uint32_t", "float"),
    uint8_uint32=("uint8_t", "uint32_t", "float"),
    float_uint64=("float", "uint64_t", "float"),
)


def instance_file_name(type_path, mxdim, team):
    """Return the name of the generated .cu file for one instantiation."""
    return f"search_multi_cta_{type_path}_dim{mxdim}_t{team}.cu"


def render_instance(team, mxdim, data_t, idx_t, distance_t):
    """Return the full text of the .cu file holding one explicit instantiation.

    The text is `header`, followed by a single invocation of the
    `instantiate_kernel_selection` macro, followed by `trailer`.
    """
    instantiation = (
        f"instantiate_kernel_selection({team}, {mxdim}, {data_t}, {idx_t}, {distance_t});\n"
    )
    return header + instantiation + trailer


def main():
    """Write every instantiation file into the current working directory.

    For each generated file, the path to paste into CMakeLists.txt is printed
    to stdout.
    """
    for type_path, (data_t, idx_t, distance_t) in search_types.items():
        for (mxdim, team) in mxdim_team:
            path = instance_file_name(type_path, mxdim, team)
            with open(path, "w") as f:
                f.write(render_instance(team, mxdim, data_t, idx_t, distance_t))
            # For pasting into CMakeLists.txt
            print(f"src/neighbors/detail/cagra/{path}")


if __name__ == "__main__":
    main()
Loading

0 comments on commit 5e74e7e

Please sign in to comment.