Fix max_queries for CAGRA (#2081)

Fix for #2072: in single-CTA mode, CAGRA search launches one thread per query. The maximum number of threads is 65535, so the automatically selected `max_queries` must be bounded by this number (the fix clamps it to `deviceProp.maxGridSize[1]`). Authors: - Micka (https://github.com/lowener) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: #2081
rapidsai · Jan 9, 2024 · 1484a03 · 1484a03
1 parent 3b88d17
commit 1484a03
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 24 deletions.
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -118,7 +118,10 @@ void search_main(raft::resources const& res,
   RAFT_EXPECTS(queries.extent(1) == index.dim(), "Queries and index dim must match");
   const uint32_t topk = neighbors.extent(1);
 
-  if (params.max_queries == 0) { params.max_queries = queries.extent(0); }
+  cudaDeviceProp deviceProp = resource::get_device_properties(res);
+  if (params.max_queries == 0) {
+    params.max_queries = std::min<size_t>(queries.extent(0), deviceProp.maxGridSize[1]);
+  }
 
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, index.dim());

diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -147,7 +147,7 @@ struct search_plan_impl : public search_plan_impl_base {
   // defines hash_bitlen, small_hash_bitlen, small_hash_reset interval, hash_size
   inline void calc_hashmap_params(raft::resources const& res)
   {
-    // for multipel CTA search
+    // for multiple CTA search
     uint32_t mc_num_cta_per_query = 0;
     uint32_t mc_search_width      = 0;
     uint32_t mc_itopk_size        = 0;

diff --git a/notebooks/utils.py b/notebooks/utils.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -73,7 +73,7 @@ def benchmark_runs(self):
                 self.timings.append(t1 - t0)
 
 
-def load_dataset(dataset_url, work_folder=None):
+def load_dataset(dataset_url="http://ann-benchmarks.com/sift-128-euclidean.hdf5", work_folder=None):
     """Download dataset from url. It is expected that the dataset contains a hdf5 file in ann-benchmarks format
 
     Parameters
@@ -82,7 +82,6 @@ def load_dataset(dataset_url, work_folder=None):
       work_folder name of the local folder to store the dataset
 
     """
-    dataset_url = "http://ann-benchmarks.com/sift-128-euclidean.hdf5"
     dataset_filename = dataset_url.split("/")[-1]
 
     # We'll need to load store some data in this tutorial

diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -91,7 +91,7 @@ from pylibraft.neighbors.common cimport _get_metric_string
 
 
 cdef class IndexParams:
-    """"
+    """
     Parameters to build index for CAGRA nearest neighbor search
 
     Parameters
@@ -104,13 +104,13 @@ cdef class IndexParams:
 
     graph_degree : int, default = 64
 
-    build_algo: string denoting the graph building algorithm to use,
+    build_algo: string denoting the graph building algorithm to use, \
                 default = "ivf_pq"
         Valid values for algo: ["ivf_pq", "nn_descent"], where
-        - ivf_pq will use the IVF-PQ algorithm for building the knn graph
-        - nn_descent (experimental) will use the NN-Descent algorithm for
-          building the knn graph. It is expected to be generally
-          faster than ivf_pq.
+            - ivf_pq will use the IVF-PQ algorithm for building the knn graph
+            - nn_descent (experimental) will use the NN-Descent algorithm for
+              building the knn graph. It is expected to be generally
+              faster than ivf_pq.
     """
     cdef c_cagra.index_params params
 
@@ -501,10 +501,10 @@ cdef class SearchParams:
         Upper limit of search iterations. Auto select when 0.
     algo: string denoting the search algorithm to use, default = "auto"
         Valid values for algo: ["auto", "single_cta", "multi_cta"], where
-        - auto will automatically select the best value based on query size
-        - single_cta is better when query contains larger number of
-        vectors (e.g >10)
-        - multi_cta is better when query contains only a few vectors
+            - auto will automatically select the best value based on query size
+            - single_cta is better when query contains larger number of
+              vectors (e.g >10)
+            - multi_cta is better when query contains only a few vectors
     team_size: int, default = 0
         Number of threads used to calculate a single distance. 4, 8, 16,
         or 32.
@@ -516,13 +516,13 @@ cdef class SearchParams:
     thread_block_size: int, default = 0
         Thread block size. 0, 64, 128, 256, 512, 1024.
         Auto selection when 0.
-    hashmap_mode: string denoting the type of hash map to use. It's
-        usually better to allow the algorithm to select this value.,
-        default = "auto"
+    hashmap_mode: string denoting the type of hash map to use.
+        It's usually better to allow the algorithm to select this value,
+        default = "auto".
         Valid values for hashmap_mode: ["auto", "small", "hash"], where
-        - auto will automatically select the best value based on algo
-        - small will use the small shared memory hash table with resetting.
-        - hash will use a single hash table in global memory.
+            - auto will automatically select the best value based on algo
+            - small will use the small shared memory hash table with resetting.
+            - hash will use a single hash table in global memory.
     hashmap_min_bitlen: int, default = 0
         Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
     hashmap_max_fill_rate: float, default = 0.5