From d6df557eeb8388c74bc0135c0a32681f887b8bb8 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Fri, 18 Nov 2022 18:00:59 +0100 Subject: [PATCH] IVF-PQ: use device properties helper (#1035) Use raft handle's lazy-loading helper `get_device_properties` instead of explicitly calling `cudaGetDeviceProperties` on every kernel launch, which is a costly operation. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/raft/pull/1035 --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 0ff659ae5d..c1a3682f47 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -1019,7 +1019,8 @@ struct ivfpq_compute_similarity { * beyond this limit do not consider increasing the number of active blocks per SM * would improve locality anymore. */ - static inline auto select(bool manage_local_topk, + static inline auto select(const cudaDeviceProp& dev_props, + bool manage_local_topk, int locality_hint, double preferred_shmem_carveout, uint32_t pq_bits, @@ -1029,12 +1030,6 @@ struct ivfpq_compute_similarity { uint32_t n_probes, uint32_t topk) -> selected { - cudaDeviceProp dev_props; - { - int cur_dev; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - RAFT_CUDA_TRY(cudaGetDeviceProperties(&dev_props, cur_dev)); - } // Shared memory for storing the lookup table size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); // Shared memory for storing pre-computed pieces to speedup the lookup table construction @@ -1364,7 +1359,8 @@ void ivfpq_search_worker(const handle_t& handle, } auto search_instance = - ivfpq_compute_similarity::select(manage_local_topk, + ivfpq_compute_similarity::select(handle.get_device_properties(), + manage_local_topk, coresidency, preferred_shmem_carveout, index.pq_bits(),