From 4c06608e51d8839a4e98a1ee7f5a30fbfaaac56b Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Fri, 19 Jan 2024 07:21:25 +0100 Subject: [PATCH 1/2] Support for fp16 in CAGRA and IVF-PQ (#2085) Add fp16 (CUDA half) support to CAGRA and its dependencies. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) - tsuki (https://github.com/enp1s0) URL: https://github.com/rapidsai/raft/pull/2085 --- cpp/CMakeLists.txt | 14 ++ .../core/detail/mdspan_numpy_serializer.hpp | 14 +- .../cagra/search_multi_cta_kernel-ext.cuh | 238 +++++++++--------- .../cagra/search_single_cta_kernel-ext.cuh | 10 + .../detail/ivf_flat_interleaved_scan-ext.cuh | 4 + .../detail/ivf_flat_interleaved_scan-inl.cuh | 2 +- .../neighbors/detail/ivf_flat_search-ext.cuh | 4 + .../raft/neighbors/detail/ivf_pq_build.cuh | 8 +- .../raft/neighbors/detail/ivf_pq_search.cuh | 3 +- .../raft/neighbors/detail/refine_host-ext.hpp | 9 + cpp/include/raft/neighbors/ivf_pq-ext.cuh | 3 + cpp/include/raft/util/device_loads_stores.cuh | 144 +++++++++++ .../cagra/search_multi_cta_00_generate.py | 2 + ...earch_multi_cta_half_uint32_dim1024_t32.cu | 66 +++++ .../search_multi_cta_half_uint32_dim128_t8.cu | 66 +++++ ...search_multi_cta_half_uint32_dim256_t16.cu | 66 +++++ ...search_multi_cta_half_uint32_dim512_t32.cu | 66 +++++ ...earch_multi_cta_half_uint64_dim1024_t32.cu | 66 +++++ .../search_multi_cta_half_uint64_dim128_t8.cu | 66 +++++ ...search_multi_cta_half_uint64_dim256_t16.cu | 66 +++++ ...search_multi_cta_half_uint64_dim512_t32.cu | 66 +++++ .../cagra/search_single_cta_00_generate.py | 2 + ...arch_single_cta_half_uint32_dim1024_t32.cu | 67 +++++ ...search_single_cta_half_uint32_dim128_t8.cu | 67 +++++ ...earch_single_cta_half_uint32_dim256_t16.cu | 67 +++++ ...earch_single_cta_half_uint32_dim512_t32.cu | 67 +++++ ...arch_single_cta_half_uint64_dim1024_t32.cu | 67 +++++ ...search_single_cta_half_uint64_dim128_t8.cu | 67 +++++ ...earch_single_cta_half_uint64_dim256_t16.cu | 67 +++++ ...earch_single_cta_half_uint64_dim512_t32.cu | 67 +++++ ...flat_interleaved_scan_half_half_int64_t.cu | 44 ++++ .../detail/refine_host_half_float.cpp | 31 +++ cpp/src/neighbors/ivfpq_build_half_int64_t.cu | 38 +++ .../neighbors/ivfpq_extend_half_int64_t.cu | 52 ++++ .../neighbors/ivfpq_search_half_int64_t.cu | 44 ++++ cpp/src/neighbors/refine_00_generate.py | 1 + cpp/src/neighbors/refine_half_float.cu | 50 ++++ cpp/src/raft_runtime/neighbors/cagra_build.cu | 3 + .../raft_runtime/neighbors/cagra_search.cu | 3 + .../raft_runtime/neighbors/cagra_serialize.cu | 3 + cpp/test/CMakeLists.txt | 10 + cpp/test/neighbors/ann_cagra.cuh | 55 +++- .../neighbors/ann_cagra/test_half_int64_t.cu | 29 +++ .../neighbors/ann_cagra/test_half_uint32_t.cu | 40 +++ 44 files changed, 1792 insertions(+), 132 deletions(-) create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu create mode 100644 cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu create mode 100644 cpp/src/neighbors/detail/ivf_flat_interleaved_scan_half_half_int64_t.cu create mode 100644 cpp/src/neighbors/detail/refine_host_half_float.cpp create mode 100644 cpp/src/neighbors/ivfpq_build_half_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_extend_half_int64_t.cu create mode 100644 cpp/src/neighbors/ivfpq_search_half_int64_t.cu create mode 100644 cpp/src/neighbors/refine_half_float.cu create mode 100644 cpp/test/neighbors/ann_cagra/test_half_int64_t.cu create mode 100644 cpp/test/neighbors/ann_cagra/test_half_uint32_t.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 38a7812301..650bc1a059 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,6 +346,10 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu @@ -358,6 +362,10 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu @@ -367,6 +375,7 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_half_half_int64_t.cu src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu src/neighbors/detail/ivf_flat_search.cu @@ -378,6 +387,7 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu src/neighbors/detail/refine_host_float_float.cpp + src/neighbors/detail/refine_host_half_float.cpp src/neighbors/detail/refine_host_int8_t_float.cpp src/neighbors/detail/refine_host_uint8_t_float.cpp src/neighbors/ivf_flat_build_float_int64_t.cu @@ -390,15 +400,19 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/ivf_flat_search_int8_t_int64_t.cu src/neighbors/ivf_flat_search_uint8_t_int64_t.cu src/neighbors/ivfpq_build_float_int64_t.cu + src/neighbors/ivfpq_build_half_int64_t.cu src/neighbors/ivfpq_build_int8_t_int64_t.cu src/neighbors/ivfpq_build_uint8_t_int64_t.cu src/neighbors/ivfpq_extend_float_int64_t.cu + src/neighbors/ivfpq_extend_half_int64_t.cu src/neighbors/ivfpq_extend_int8_t_int64_t.cu src/neighbors/ivfpq_extend_uint8_t_int64_t.cu src/neighbors/ivfpq_search_float_int64_t.cu + src/neighbors/ivfpq_search_half_int64_t.cu src/neighbors/ivfpq_search_int8_t_int64_t.cu src/neighbors/ivfpq_search_uint8_t_int64_t.cu src/neighbors/refine_float_float.cu + src/neighbors/refine_half_float.cu src/neighbors/refine_int8_t_float.cu src/neighbors/refine_uint8_t_float.cu src/raft_runtime/cluster/cluster_cost.cuh diff --git a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp index 168dabc990..176309c8ce 100644 --- a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp +++ b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp @@ -20,6 +20,10 @@ #include #include +#if defined(_RAFT_HAS_CUDA) +#include +#endif + #include #include #include @@ -121,6 +125,14 @@ inline dtype_t get_numpy_dtype() return {RAFT_NUMPY_HOST_ENDIAN_CHAR, 'f', sizeof(T)}; } +#if defined(_RAFT_HAS_CUDA) +template , bool> = true> +inline dtype_t get_numpy_dtype() +{ + return {RAFT_NUMPY_HOST_ENDIAN_CHAR, 'e', sizeof(T)}; +} +#endif + template && std::is_signed_v, bool> = true> inline dtype_t get_numpy_dtype() @@ -273,7 +285,7 @@ inline dtype_t parse_descr(std::string typestr) const char endian_chars[] = { RAFT_NUMPY_LITTLE_ENDIAN_CHAR, RAFT_NUMPY_BIG_ENDIAN_CHAR, RAFT_NUMPY_NO_ENDIAN_CHAR}; - const char numtype_chars[] = {'f', 'i', 'u', 'c'}; + const char numtype_chars[] = {'f', 'i', 'u', 'c', 'e'}; RAFT_EXPECTS(std::find(std::begin(endian_chars), std::end(endian_chars), byteorder_c) != std::end(endian_chars), diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh index 3179586b98..7a5ad17460 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh @@ -1,114 +1,124 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include // none_cagra_sample_filter -#include // RAFT_EXPLICIT - -namespace raft::neighbors::cagra::detail { -namespace multi_cta_search { - -#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY - -template -void select_and_run(raft::device_matrix_view dataset, - raft::device_matrix_view graph, - INDEX_T* const topk_indices_ptr, - DISTANCE_T* const topk_distances_ptr, - const DATA_T* const queries_ptr, - const uint32_t num_queries, - const INDEX_T* dev_seed_ptr, - uint32_t* const num_executed_iterations, - uint32_t topk, - uint32_t block_size, - uint32_t result_buffer_size, - uint32_t smem_size, - int64_t hash_bitlen, - INDEX_T* hashmap_ptr, - uint32_t num_cta_per_query, - uint32_t num_random_samplings, - uint64_t rand_xor_mask, - uint32_t num_seeds, - size_t itopk_size, - size_t search_width, - size_t min_iterations, - size_t max_iterations, - SAMPLE_FILTER_T sample_filter, - cudaStream_t stream) RAFT_EXPLICIT; -#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY - -#define instantiate_kernel_selection( \ - TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ - extern template void \ - select_and_run( \ - raft::device_matrix_view dataset, \ - raft::device_matrix_view graph, \ - INDEX_T* const topk_indices_ptr, \ - DISTANCE_T* const topk_distances_ptr, \ - const DATA_T* const queries_ptr, \ - const uint32_t num_queries, \ - const INDEX_T* dev_seed_ptr, \ - uint32_t* const num_executed_iterations, \ - uint32_t topk, \ - uint32_t block_size, \ - uint32_t result_buffer_size, \ - uint32_t smem_size, \ - int64_t hash_bitlen, \ - INDEX_T* hashmap_ptr, \ - uint32_t num_cta_per_query, \ - uint32_t num_random_samplings, \ - uint64_t rand_xor_mask, \ - uint32_t num_seeds, \ - size_t itopk_size, \ - size_t search_width, \ - size_t min_iterations, \ - size_t max_iterations, \ - SAMPLE_FILTER_T sample_filter, \ - cudaStream_t stream); - -instantiate_kernel_selection( - 32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); -instantiate_kernel_selection( - 32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); - -#undef instantiate_kernel_selection -} // namespace multi_cta_search -} // namespace raft::neighbors::cagra::detail +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include // none_cagra_sample_filter +#include // RAFT_EXPLICIT + +#include + +namespace raft::neighbors::cagra::detail { +namespace multi_cta_search { + +#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY + +template +void select_and_run(raft::device_matrix_view dataset, + raft::device_matrix_view graph, + INDEX_T* const topk_indices_ptr, + DISTANCE_T* const topk_distances_ptr, + const DATA_T* const queries_ptr, + const uint32_t num_queries, + const INDEX_T* dev_seed_ptr, + uint32_t* const num_executed_iterations, + uint32_t topk, + uint32_t block_size, + uint32_t result_buffer_size, + uint32_t smem_size, + int64_t hash_bitlen, + INDEX_T* hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_random_samplings, + uint64_t rand_xor_mask, + uint32_t num_seeds, + size_t itopk_size, + size_t search_width, + size_t min_iterations, + size_t max_iterations, + SAMPLE_FILTER_T sample_filter, + cudaStream_t stream) RAFT_EXPLICIT; +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + extern template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 32, 1024, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 8, 128, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 32, 1024, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 8, 128, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 16, 256, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 32, 512, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 8, 128, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 16, 256, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 32, 512, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 32, 1024, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 8, 128, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 16, 256, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_kernel_selection( + 32, 512, uint8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection +} // namespace multi_cta_search +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh index 99e2fbede2..fef060ffee 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh @@ -18,6 +18,8 @@ #include #include // RAFT_EXPLICIT +#include + namespace raft::neighbors::cagra::detail { namespace single_cta_search { @@ -96,6 +98,14 @@ instantiate_single_cta_select_and_run( 16, 256, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); instantiate_single_cta_select_and_run( 32, 512, float, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_single_cta_select_and_run( + 32, 1024, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_single_cta_select_and_run( + 8, 128, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_single_cta_select_and_run( + 16, 256, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); +instantiate_single_cta_select_and_run( + 32, 512, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); instantiate_single_cta_select_and_run( 32, 1024, int8_t, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); instantiate_single_cta_select_and_run( diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh index 6bd8c91157..58e94ee7aa 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh @@ -22,6 +22,8 @@ #include // RAFT_EXPLICIT #include // rmm:cuda_stream_view +#include + #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY namespace raft::neighbors::ivf_flat::detail { @@ -67,6 +69,8 @@ void ivfflat_interleaved_scan(const raft::neighbors::ivf_flat::index& i instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( float, float, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + half, half, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( int8_t, int32_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh index 6e2821f3de..1cf042c6cd 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh @@ -168,7 +168,7 @@ struct loadAndComputeDist { const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks) { const int loadDim = dimBlocks + lane_id; - T queryReg = loadDim < dim ? query[loadDim] : 0; + T queryReg = loadDim < dim ? query[loadDim] : T{0}; const int loadDataIdx = lane_id * Veclen; for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) { T enc[Veclen]; diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh index a4d885aee9..d7dfe5c363 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh @@ -21,6 +21,8 @@ #include // none_ivf_sample_filter #include // RAFT_EXPLICIT +#include + #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY namespace raft::neighbors::ivf_flat::detail { @@ -56,6 +58,8 @@ void search(raft::resources const& handle, instantiate_raft_neighbors_ivf_flat_detail_search( float, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_search( + half, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_flat_detail_search( int8_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_flat_detail_search( diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 80033c384b..2dfb261f32 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -54,6 +54,8 @@ #include #include +#include + #include #include @@ -1562,7 +1564,8 @@ void extend(raft::resources const& handle, RAFT_EXPECTS(new_indices != nullptr || index->size() == 0, "You must pass data indices when the index is non-empty."); - static_assert(std::is_same_v || std::is_same_v || std::is_same_v, + static_assert(std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v, "Unsupported data type"); rmm::mr::device_memory_resource* device_memory = raft::resource::get_workspace_resource(handle); @@ -1734,7 +1737,8 @@ auto build(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); - static_assert(std::is_same_v || std::is_same_v || std::is_same_v, + static_assert(std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v, "Unsupported data type"); RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 0619ee4503..d000a1a4d3 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -806,7 +806,8 @@ inline void search(raft::resources const& handle, float* distances, IvfSampleFilterT sample_filter = IvfSampleFilterT()) { - static_assert(std::is_same_v || std::is_same_v || std::is_same_v, + static_assert(std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v, "Unsupported element type."); common::nvtx::range fun_scope( "ivf_pq::search(n_queries = %u, n_probes = %u, k = %u, dim = %zu)", diff --git a/cpp/include/raft/neighbors/detail/refine_host-ext.hpp b/cpp/include/raft/neighbors/detail/refine_host-ext.hpp index d807ac5239..a2f75d0f8b 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-ext.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-ext.hpp @@ -18,10 +18,15 @@ #include // int64_t +#include // _RAFT_HAS_CUDA #include // raft::host_matrix_view #include // raft::distance::DistanceType #include // RAFT_EXPLICIT +#if defined(_RAFT_HAS_CUDA) +#include +#endif + #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY namespace raft::neighbors::detail { @@ -52,4 +57,8 @@ instantiate_raft_neighbors_refine(int64_t, float, float, int64_t); instantiate_raft_neighbors_refine(int64_t, int8_t, float, int64_t); instantiate_raft_neighbors_refine(int64_t, uint8_t, float, int64_t); +#if defined(_RAFT_HAS_CUDA) +instantiate_raft_neighbors_refine(int64_t, half, float, int64_t); +#endif + #undef instantiate_raft_neighbors_refine diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh index 038b2e7263..188219dfa2 100644 --- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh @@ -150,6 +150,7 @@ search(raft::resources const& handle, ->raft::neighbors::ivf_pq::index; instantiate_raft_neighbors_ivf_pq_build(float, int64_t); +instantiate_raft_neighbors_ivf_pq_build(half, int64_t); instantiate_raft_neighbors_ivf_pq_build(int8_t, int64_t); instantiate_raft_neighbors_ivf_pq_build(uint8_t, int64_t); @@ -184,6 +185,7 @@ instantiate_raft_neighbors_ivf_pq_build(uint8_t, int64_t); IdxT n_rows); instantiate_raft_neighbors_ivf_pq_extend(float, int64_t); +instantiate_raft_neighbors_ivf_pq_extend(half, int64_t); instantiate_raft_neighbors_ivf_pq_extend(int8_t, int64_t); instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t); @@ -220,6 +222,7 @@ instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t); float* distances) instantiate_raft_neighbors_ivf_pq_search(float, int64_t); +instantiate_raft_neighbors_ivf_pq_search(half, int64_t); instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t); instantiate_raft_neighbors_ivf_pq_search(uint8_t, int64_t); diff --git a/cpp/include/raft/util/device_loads_stores.cuh b/cpp/include/raft/util/device_loads_stores.cuh index 0b5b937245..780939ee85 100644 --- a/cpp/include/raft/util/device_loads_stores.cuh +++ b/cpp/include/raft/util/device_loads_stores.cuh @@ -17,6 +17,7 @@ #pragma once #include // uintX_t +#include #include #include // DI @@ -148,6 +149,57 @@ DI void sts(int32_t* addr, const int32_t (&x)[4]) : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3])); } +DI void sts(half* addr, const half& x) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u16 [%0], {%1};" : : "l"(s), "h"(*reinterpret_cast(&x))); +} +DI void sts(half* addr, const half (&x)[1]) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.u16 [%0], {%1};" : : "l"(s), "h"(*reinterpret_cast(x))); +} +DI void sts(half* addr, const half (&x)[2]) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.u16 [%0], {%1, %2};" + : + : "l"(s), + "h"(*reinterpret_cast(x)), + "h"(*reinterpret_cast(x + 1))); +} +DI void sts(half* addr, const half (&x)[4]) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.u16 [%0], {%1, %2, %3, %4};" + : + : "l"(s), + "h"(*reinterpret_cast(x)), + "h"(*reinterpret_cast(x + 1)), + "h"(*reinterpret_cast(x + 2)), + "h"(*reinterpret_cast(x + 3))); +} +DI void sts(half* addr, const half (&x)[8]) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + half2 y[4]; + y[0].x = x[0]; + y[0].y = x[1]; + y[1].x = x[2]; + y[1].y = x[3]; + y[2].x = x[4]; + y[2].y = x[5]; + y[3].x = x[6]; + y[3].y = x[7]; + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" + : + : "l"(s), + "r"(*reinterpret_cast(y)), + "r"(*reinterpret_cast(y + 1)), + "r"(*reinterpret_cast(y + 2)), + "r"(*reinterpret_cast(y + 3))); +} + DI void sts(float* addr, const float& x) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); @@ -323,6 +375,52 @@ DI void lds(int32_t& x, const int32_t* addr) asm volatile("ld.shared.u32 {%0}, [%1];" : "=r"(x) : "l"(s1)); } +DI void lds(half& x, const half* addr) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(*reinterpret_cast(&x)) : "l"(s)); +} +DI void lds(half (&x)[1], const half* addr) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.u16 {%0}, [%1];" : "=h"(*reinterpret_cast(x)) : "l"(s)); +} +DI void lds(half (&x)[2], const half* addr) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.u16 {%0, %1}, [%2];" + : "=h"(*reinterpret_cast(x)), "=h"(*reinterpret_cast(x + 1)) + : "l"(s)); +} +DI void lds(half (&x)[4], const half* addr) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.u16 {%0, %1, %2, %3}, [%4];" + : "=h"(*reinterpret_cast(x)), + "=h"(*reinterpret_cast(x + 1)), + "=h"(*reinterpret_cast(x + 2)), + "=h"(*reinterpret_cast(x + 3)) + : "l"(s)); +} +DI void lds(half (&x)[8], const half* addr) +{ + auto s = __cvta_generic_to_shared(reinterpret_cast(addr)); + half2 y[4]; + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(*reinterpret_cast(y)), + "=r"(*reinterpret_cast(y + 1)), + "=r"(*reinterpret_cast(y + 2)), + "=r"(*reinterpret_cast(y + 3)) + : "l"(s)); + x[0] = y[0].x; + x[1] = y[0].y; + x[2] = y[1].x; + x[3] = y[1].y; + x[4] = y[2].x; + x[5] = y[2].y; + x[6] = y[3].x; + x[7] = y[3].y; +} DI void lds(float& x, const float* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); @@ -410,6 +508,52 @@ DI void ldg(float (&x)[4], const float* addr) : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(addr)); } +DI void ldg(half& x, const half* addr) +{ + asm volatile("ld.global.cg.u16 {%0}, [%1];" + : "=h"(*reinterpret_cast(&x)) + : "l"(reinterpret_cast(addr))); +} +DI void ldg(half (&x)[1], const half* addr) +{ + asm volatile("ld.global.cg.u16 {%0}, [%1];" + : "=h"(*reinterpret_cast(x)) + : "l"(reinterpret_cast(addr))); +} +DI void ldg(half (&x)[2], const half* addr) +{ + asm volatile("ld.global.cg.v2.u16 {%0, %1}, [%2];" + : "=h"(*reinterpret_cast(x)), "=h"(*reinterpret_cast(x + 1)) + : "l"(reinterpret_cast(addr))); +} +DI void ldg(half (&x)[4], const half* addr) +{ + asm volatile("ld.global.cg.v4.u16 {%0, %1, %2, %3}, [%4];" + : "=h"(*reinterpret_cast(x)), + "=h"(*reinterpret_cast(x + 1)), + "=h"(*reinterpret_cast(x + 2)), + "=h"(*reinterpret_cast(x + 3)) + : "l"(reinterpret_cast(addr))); +} + +DI void ldg(half (&x)[8], const half* addr) +{ + half2 y[4]; + asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" + : "=r"(*reinterpret_cast(y)), + "=r"(*reinterpret_cast(y + 1)), + "=r"(*reinterpret_cast(y + 2)), + "=r"(*reinterpret_cast(y + 3)) + : "l"(reinterpret_cast(addr))); + x[0] = y[0].x; + x[1] = y[0].y; + x[2] = y[1].x; + x[3] = y[1].y; + x[4] = y[2].x; + x[5] = y[2].y; + x[6] = y[3].x; + x[7] = y[3].y; +} DI void ldg(double& x, const double* addr) { asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr)); diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py index 3bf2cba8f2..6f8766c86b 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py @@ -90,9 +90,11 @@ "uint32_t", "float", ), # data_t, vec_idx_t, distance_t + half_uint32=("half", "uint32_t", "float"), int8_uint32=("int8_t", "uint32_t", "float"), uint8_uint32=("uint8_t", "uint32_t", "float"), float_uint64=("float", "uint64_t", "float"), + half_uint64=("half", "uint64_t", "float"), ) # knn for type_path, (data_t, idx_t, distance_t) in search_types.items(): diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..fa89bca45f --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim1024_t32.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 32, 1024, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu new file mode 100644 index 0000000000..645ca61ff5 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim128_t8.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 8, 128, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu new file mode 100644 index 0000000000..41b6f9b420 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim256_t16.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 16, 256, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu new file mode 100644 index 0000000000..38f0ac3b04 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint32_dim512_t32.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 32, 512, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu new file mode 100644 index 0000000000..c462a9d359 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 32, 1024, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu new file mode 100644 index 0000000000..f5b2874e20 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 8, 128, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu new file mode 100644 index 0000000000..0b01428b86 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 16, 256, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu new file mode 100644 index 0000000000..70228a129d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu @@ -0,0 +1,66 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_kernel_selection( + 32, 512, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py index d1d976ee87..1515f43134 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py @@ -93,9 +93,11 @@ search_types = dict( float_uint32=("float", "uint32_t", "float"), # data_t, idx_t, distance_t + half_uint32=("half", "uint32_t", "float"), int8_uint32=("int8_t", "uint32_t", "float"), uint8_uint32=("uint8_t", "uint32_t", "float"), float_uint64=("float", "uint64_t", "float"), + half_uint64=("half", "uint64_t", "float"), ) # knn diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..29e7bfa250 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim1024_t32.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 32, 1024, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu new file mode 100644 index 0000000000..a004f900d0 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim128_t8.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 8, 128, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu new file mode 100644 index 0000000000..549849b21d --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim256_t16.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 16, 256, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu new file mode 100644 index 0000000000..3825f572f7 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint32_dim512_t32.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 32, 512, half, uint32_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu new file mode 100644 index 0000000000..31d83f443b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 32, 1024, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu new file mode 100644 index 0000000000..3493ab294c --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 8, 128, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu new file mode 100644 index 0000000000..6e09709994 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 16, 256, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu new file mode 100644 index 0000000000..4bc0158f7e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu @@ -0,0 +1,67 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T, SAMPLE_FILTER_T) \ + template void \ + select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + SAMPLE_FILTER_T sample_filter, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run( + 32, 512, half, uint64_t, float, raft::neighbors::filtering::none_cagra_sample_filter); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_half_half_int64_t.cu b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_half_half_int64_t.cu new file mode 100644 index 0000000000..3c467a12d8 --- /dev/null +++ b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_half_half_int64_t.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( \ + T, AccT, IdxT, IvfSampleFilterT) \ + template void \ + raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ + const raft::neighbors::ivf_flat::index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const uint32_t queries_offset, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IvfSampleFilterT sample_filter, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ + rmm::cuda_stream_view stream) + +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + half, half, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); + +#undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan diff --git a/cpp/src/neighbors/detail/refine_host_half_float.cpp b/cpp/src/neighbors/detail/refine_host_half_float.cpp new file mode 100644 index 0000000000..d9fb2864fe --- /dev/null +++ b/cpp/src/neighbors/detail/refine_host_half_float.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include + +#define instantiate_raft_neighbors_refine(IdxT, DataT, DistanceT, ExtentsT) \ + template void raft::neighbors::detail::refine_host( \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbor_candidates, \ + raft::host_matrix_view indices, \ + raft::host_matrix_view distances, \ + distance::DistanceType metric); + +instantiate_raft_neighbors_refine(int64_t, half, float, int64_t); + +#undef instantiate_raft_neighbors_refine diff --git a/cpp/src/neighbors/ivfpq_build_half_int64_t.cu b/cpp/src/neighbors/ivfpq_build_half_int64_t.cu new file mode 100644 index 0000000000..aacb2d7198 --- /dev/null +++ b/cpp/src/neighbors/ivfpq_build_half_int64_t.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include // raft::neighbors::ivf_pq::index + +#include + +#define instantiate_raft_neighbors_ivf_pq_build(T, IdxT) \ + template raft::neighbors::ivf_pq::index raft::neighbors::ivf_pq::build( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::index_params& params, \ + raft::device_matrix_view dataset); \ + \ + template auto raft::neighbors::ivf_pq::build( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::index_params& params, \ + const T* dataset, \ + IdxT n_rows, \ + uint32_t dim) \ + ->raft::neighbors::ivf_pq::index; + +instantiate_raft_neighbors_ivf_pq_build(half, int64_t); + +#undef instantiate_raft_neighbors_ivf_pq_build diff --git a/cpp/src/neighbors/ivfpq_extend_half_int64_t.cu b/cpp/src/neighbors/ivfpq_extend_half_int64_t.cu new file mode 100644 index 0000000000..85477ca4a0 --- /dev/null +++ b/cpp/src/neighbors/ivfpq_extend_half_int64_t.cu @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include // raft::neighbors::ivf_pq::index + +#include + +#define instantiate_raft_neighbors_ivf_pq_extend(T, IdxT) \ + template raft::neighbors::ivf_pq::index raft::neighbors::ivf_pq::extend( \ + raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const raft::neighbors::ivf_pq::index& idx); \ + \ + template void raft::neighbors::ivf_pq::extend( \ + raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + raft::neighbors::ivf_pq::index* idx); \ + \ + template auto raft::neighbors::ivf_pq::extend( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::index& idx, \ + const T* new_vectors, \ + const IdxT* new_indices, \ + IdxT n_rows) \ + ->raft::neighbors::ivf_pq::index; \ + \ + template void raft::neighbors::ivf_pq::extend( \ + raft::resources const& handle, \ + raft::neighbors::ivf_pq::index* idx, \ + const T* new_vectors, \ + const IdxT* new_indices, \ + IdxT n_rows); + +instantiate_raft_neighbors_ivf_pq_extend(half, int64_t); + +#undef instantiate_raft_neighbors_ivf_pq_extend diff --git a/cpp/src/neighbors/ivfpq_search_half_int64_t.cu b/cpp/src/neighbors/ivfpq_search_half_int64_t.cu new file mode 100644 index 0000000000..c9f2e6fdd5 --- /dev/null +++ b/cpp/src/neighbors/ivfpq_search_half_int64_t.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include // raft::neighbors::ivf_pq::index + +#include + +#define instantiate_raft_neighbors_ivf_pq_search(T, IdxT) \ + template void raft::neighbors::ivf_pq::search( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::search_params& params, \ + const raft::neighbors::ivf_pq::index& idx, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void raft::neighbors::ivf_pq::search( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::search_params& params, \ + const raft::neighbors::ivf_pq::index& idx, \ + const T* queries, \ + uint32_t n_queries, \ + uint32_t k, \ + IdxT* neighbors, \ + float* distances, \ + rmm::mr::device_memory_resource* mr) + +instantiate_raft_neighbors_ivf_pq_search(half, int64_t); + +#undef instantiate_raft_neighbors_ivf_pq_search diff --git a/cpp/src/neighbors/refine_00_generate.py b/cpp/src/neighbors/refine_00_generate.py index ef51eae437..fd11f4d5c3 100644 --- a/cpp/src/neighbors/refine_00_generate.py +++ b/cpp/src/neighbors/refine_00_generate.py @@ -63,6 +63,7 @@ types = dict( float_float= ("float", "float"), + half_float= ("half", "float"), int8_t_float=("int8_t", "float"), uint8_t_float=("uint8_t", "float"), ) diff --git a/cpp/src/neighbors/refine_half_float.cu b/cpp/src/neighbors/refine_half_float.cu new file mode 100644 index 0000000000..c323951b82 --- /dev/null +++ b/cpp/src/neighbors/refine_half_float.cu @@ -0,0 +1,50 @@ + +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by refine_00_generate.py + * + * Make changes there and run in this directory: + * + * > python refine_00_generate.py + * + */ + +#include + +#define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx) \ + template void raft::neighbors::refine( \ + raft::resources const& handle, \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbor_candidates, \ + raft::device_matrix_view indices, \ + raft::device_matrix_view distances, \ + raft::distance::DistanceType metric); \ + \ + template void raft::neighbors::refine( \ + raft::resources const& handle, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbor_candidates, \ + raft::host_matrix_view indices, \ + raft::host_matrix_view distances, \ + raft::distance::DistanceType metric); + +instantiate_raft_neighbors_refine(int64_t, half, float, int64_t); + +#undef instantiate_raft_neighbors_refine diff --git a/cpp/src/raft_runtime/neighbors/cagra_build.cu b/cpp/src/raft_runtime/neighbors/cagra_build.cu index fb139f22a6..5fc472c952 100644 --- a/cpp/src/raft_runtime/neighbors/cagra_build.cu +++ b/cpp/src/raft_runtime/neighbors/cagra_build.cu @@ -19,6 +19,8 @@ #include #include +#include + namespace raft::runtime::neighbors::cagra { #define RAFT_INST_CAGRA_BUILD(T, IdxT) \ @@ -55,6 +57,7 @@ namespace raft::runtime::neighbors::cagra { } RAFT_INST_CAGRA_BUILD(float, uint32_t); +RAFT_INST_CAGRA_BUILD(half, uint32_t); RAFT_INST_CAGRA_BUILD(int8_t, uint32_t); RAFT_INST_CAGRA_BUILD(uint8_t, uint32_t); diff --git a/cpp/src/raft_runtime/neighbors/cagra_search.cu b/cpp/src/raft_runtime/neighbors/cagra_search.cu index 25a249bd76..93244077f4 100644 --- a/cpp/src/raft_runtime/neighbors/cagra_search.cu +++ b/cpp/src/raft_runtime/neighbors/cagra_search.cu @@ -17,6 +17,8 @@ #include #include +#include + namespace raft::runtime::neighbors::cagra { #define RAFT_INST_CAGRA_SEARCH(T, IdxT) \ @@ -31,6 +33,7 @@ namespace raft::runtime::neighbors::cagra { } RAFT_INST_CAGRA_SEARCH(float, uint32_t); +RAFT_INST_CAGRA_SEARCH(half, uint32_t); RAFT_INST_CAGRA_SEARCH(int8_t, uint32_t); RAFT_INST_CAGRA_SEARCH(uint8_t, uint32_t); diff --git a/cpp/src/raft_runtime/neighbors/cagra_serialize.cu b/cpp/src/raft_runtime/neighbors/cagra_serialize.cu index bf8e7bf6ee..f386bcce8e 100644 --- a/cpp/src/raft_runtime/neighbors/cagra_serialize.cu +++ b/cpp/src/raft_runtime/neighbors/cagra_serialize.cu @@ -22,6 +22,8 @@ #include #include +#include + namespace raft::runtime::neighbors::cagra { #define RAFT_INST_CAGRA_SERIALIZE(DTYPE) \ @@ -75,6 +77,7 @@ namespace raft::runtime::neighbors::cagra { } RAFT_INST_CAGRA_SERIALIZE(float); +RAFT_INST_CAGRA_SERIALIZE(half); RAFT_INST_CAGRA_SERIALIZE(int8_t); RAFT_INST_CAGRA_SERIALIZE(uint8_t); diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 931530b66a..fe29409d9b 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -362,9 +362,11 @@ if(BUILD_TESTS) NEIGHBORS_ANN_CAGRA_TEST PATH test/neighbors/ann_cagra/test_float_uint32_t.cu + test/neighbors/ann_cagra/test_half_uint32_t.cu test/neighbors/ann_cagra/test_int8_t_uint32_t.cu test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu test/neighbors/ann_cagra/test_float_int64_t.cu + test/neighbors/ann_cagra/test_half_int64_t.cu src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu @@ -373,6 +375,14 @@ if(BUILD_TESTS) src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu LIB EXPLICIT_INSTANTIATE_ONLY GPUS diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 21aed0b992..915ef8a394 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -40,6 +40,8 @@ #include +#include + #include #include #include @@ -110,42 +112,69 @@ testing::AssertionResult CheckOrder(raft::host_matrix_view index_ return testing::AssertionSuccess(); } +template +struct fpi_mapper {}; + +template <> +struct fpi_mapper { + using type = int64_t; + static constexpr int kBitshiftBase = 53; +}; + +template <> +struct fpi_mapper { + using type = int32_t; + static constexpr int kBitshiftBase = 24; +}; + +template <> +struct fpi_mapper { + using type = int16_t; + static constexpr int kBitshiftBase = 11; +}; + // Generate dataset to ensure no rounding error occurs in the norm computation of any two vectors. // When testing the CAGRA index sorting function, rounding errors can affect the norm and alter the // order of the index. To ensure the accuracy of the test, we utilize the dataset. The generation // method is based on the error-free transformation (EFT) method. -RAFT_KERNEL GenerateRoundingErrorFreeDataset_kernel(float* const ptr, +template +RAFT_KERNEL GenerateRoundingErrorFreeDataset_kernel(T* const ptr, const uint32_t size, - const uint32_t resolution) + const typename fpi_mapper::type resolution) { const auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= size) { return; } - const float u32 = *reinterpret_cast(ptr + tid); + const float u32 = *reinterpret_cast::type*>(ptr + tid); ptr[tid] = u32 / resolution; } +template void GenerateRoundingErrorFreeDataset( const raft::resources& handle, - float* const ptr, + T* const ptr, const uint32_t n_row, const uint32_t dim, raft::random::RngState& rng, const bool diff_flag // true if compute the norm between two vectors ) { + using mapper_type = fpi_mapper; + using int_type = typename mapper_type::type; auto cuda_stream = resource::get_cuda_stream(handle); const uint32_t size = n_row * dim; const uint32_t block_size = 256; const uint32_t grid_size = (size + block_size - 1) / block_size; - const int32_t resolution = - 1 << static_cast(std::floor((24 - std::log2(dim) - (diff_flag ? 1 : 0)) / 2)); - raft::random::uniformInt( - handle, rng, reinterpret_cast(ptr), size, -resolution, resolution - 1); + const auto bitshift = (mapper_type::kBitshiftBase - std::log2(dim) - (diff_flag ? 1 : 0)) / 2; + // Skip the test when `dim` is too big for type `T` to allow rounding error-free test. + if (bitshift <= 1) { GTEST_SKIP(); } + const int_type resolution = int_type{1} << static_cast(std::floor(bitshift)); + raft::random::uniformInt( + handle, rng, reinterpret_cast(ptr), size, -resolution, resolution - 1); - GenerateRoundingErrorFreeDataset_kernel<<>>( - ptr, size, resolution); + GenerateRoundingErrorFreeDataset_kernel + <<>>(ptr, size, resolution); } } // namespace @@ -300,7 +329,7 @@ class AnnCagraTest : public ::testing::TestWithParam { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); search_queries.resize(ps.n_queries * ps.dim, stream_); raft::random::RngState r(1234ULL); - if constexpr (std::is_same{}) { + if constexpr (std::is_same_v || std::is_same_v) { GenerateRoundingErrorFreeDataset(handle_, database.data(), ps.n_rows, ps.dim, r, true); GenerateRoundingErrorFreeDataset( handle_, search_queries.data(), ps.n_queries, ps.dim, r, true); @@ -389,7 +418,7 @@ class AnnCagraSortTest : public ::testing::TestWithParam { { database.resize(((size_t)ps.n_rows) * ps.dim, handle_.get_stream()); raft::random::RngState r(1234ULL); - if constexpr (std::is_same{}) { + if constexpr (std::is_same_v || std::is_same_v) { GenerateRoundingErrorFreeDataset(handle_, database.data(), ps.n_rows, ps.dim, r, false); } else { raft::random::uniformInt( @@ -656,7 +685,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); search_queries.resize(ps.n_queries * ps.dim, stream_); raft::random::RngState r(1234ULL); - if constexpr (std::is_same{}) { + if constexpr (std::is_same_v || std::is_same_v) { GenerateRoundingErrorFreeDataset(handle_, database.data(), ps.n_rows, ps.dim, r, true); GenerateRoundingErrorFreeDataset( handle_, search_queries.data(), ps.n_queries, ps.dim, r, true); diff --git a/cpp/test/neighbors/ann_cagra/test_half_int64_t.cu b/cpp/test/neighbors/ann_cagra/test_half_int64_t.cu new file mode 100644 index 0000000000..fdd510bc5d --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/test_half_int64_t.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../ann_cagra.cuh" +#include "search_kernel_uint64_t.cuh" + +namespace raft::neighbors::cagra { + +typedef AnnCagraTest AnnCagraTestH_I64; +TEST_P(AnnCagraTestH_I64, AnnCagra) { this->testCagra(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestH_I64, ::testing::ValuesIn(inputs)); + +} // namespace raft::neighbors::cagra diff --git a/cpp/test/neighbors/ann_cagra/test_half_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_half_uint32_t.cu new file mode 100644 index 0000000000..dea6ae0c23 --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/test_half_uint32_t.cu @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../ann_cagra.cuh" + +namespace raft::neighbors::cagra { + +typedef AnnCagraTest AnnCagraTestH_U32; +TEST_P(AnnCagraTestH_U32, AnnCagra) { this->testCagra(); } + +typedef AnnCagraSortTest AnnCagraSortTestH_U32; +TEST_P(AnnCagraSortTestH_U32, AnnCagraSort) { this->testCagraSort(); } + +typedef AnnCagraFilterTest AnnCagraFilterTestH_U32; +TEST_P(AnnCagraFilterTestH_U32, AnnCagraFilter) +{ + this->testCagraFilter(); + this->testCagraRemoved(); +} + +INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestH_U32, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(AnnCagraSortTest, AnnCagraSortTestH_U32, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(AnnCagraFilterTest, AnnCagraFilterTestH_U32, ::testing::ValuesIn(inputs)); + +} // namespace raft::neighbors::cagra From fe02040d2e9b216c9ca3f44bb39f43094d92e206 Mon Sep 17 00:00:00 2001 From: achirkin Date: Sun, 11 Feb 2024 07:49:35 +0100 Subject: [PATCH 2/2] Fix the shmem size in the ivf-flat scan kernel --- .../raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh index 1cf042c6cd..51cd2876d8 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh @@ -844,7 +844,7 @@ void launch_kernel(Lambda lambda, int smem_size = query_smem_elems * sizeof(T); constexpr int kSubwarpSize = std::min(Capacity, WarpSize); auto block_merge_mem = - raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide( + raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide( kThreadsPerBlock / kSubwarpSize, k); smem_size += std::max(smem_size, block_merge_mem);