From 0a2f023f6cc451fa29e5d103996bafbbb667c3a4 Mon Sep 17 00:00:00 2001 From: Artem Chirkin <9253178+achirkin@users.noreply.github.com> Date: Mon, 5 Dec 2022 05:06:43 -0800 Subject: [PATCH 1/3] Tweak the launch configuration in some non-optimal cases. --- cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index c1a3682f47..d129cde1e0 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -1173,7 +1173,15 @@ struct ivfpq_compute_similarity { // If we don't have enough repeating probes (locality_hint < tmp.blocks_per_sm), // the locality is not going to improve with increasing the number of blocks per SM. // Hence, the only metric here is the occupancy. - select_it = tmp.occupancy > cur.occupancy; + bool improves_occupancy = tmp.occupancy > cur.occupancy; + // Otherwise, the performance still improves with a smaller block size, + // given there are enough resources + bool improves_parallelism = + tmp.occupancy == cur.occupancy // same occupancy + && n_threads_tmp >= 2 * n_threads_min // surely enough blocks + && // doesn't use too much shmem + tmp.shmem_use <= std::min(0.5, double(max_carveout) / 100.0); + select_it = improves_occupancy || improves_parallelism; } else { // If we don't use shared memory for the lookup table, increasing the number of blocks // is very taxing on the global memory usage. 
From ced1e7f70d48f662d8a9016317067946e1c67218 Mon Sep 17 00:00:00 2001 From: Artem Chirkin <9253178+achirkin@users.noreply.github.com> Date: Thu, 8 Dec 2022 00:33:49 -0800 Subject: [PATCH 2/3] Simplify the selection condition --- cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index d129cde1e0..3991c1a4a3 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -1175,12 +1175,10 @@ struct ivfpq_compute_similarity { // Hence, the only metric here is the occupancy. bool improves_occupancy = tmp.occupancy > cur.occupancy; // Otherwise, the performance still improves with a smaller block size, - // given there are enough resources + // given there is enough work to do bool improves_parallelism = - tmp.occupancy == cur.occupancy // same occupancy - && n_threads_tmp >= 2 * n_threads_min // surely enough blocks - && // doesn't use too much shmem - tmp.shmem_use <= std::min(0.5, double(max_carveout) / 100.0); + tmp.occupancy == cur.occupancy && + 7 * tmp.blocks_per_sm * dev_props.multiProcessorCount <= n_blocks; select_it = improves_occupancy || improves_parallelism; } else { // If we don't use shared memory for the lookup table, increasing the number of blocks From 4706f8199cfbb4bb50beaaffe8f9041586fc76da Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 8 Dec 2022 11:17:38 +0100 Subject: [PATCH 3/3] Fix a typo (different signedness comparison warning) --- cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 3991c1a4a3..cd8197bcf0 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ 
b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -1178,7 +1178,7 @@ struct ivfpq_compute_similarity { // given there is enough work to do bool improves_parallelism = tmp.occupancy == cur.occupancy && - 7 * tmp.blocks_per_sm * dev_props.multiProcessorCount <= n_blocks; + 7u * tmp.blocks_per_sm * dev_props.multiProcessorCount <= n_blocks; select_it = improves_occupancy || improves_parallelism; } else { // If we don't use shared memory for the lookup table, increasing the number of blocks