From e272176e4d674c4a10c09cadacc6023f8ab66122 Mon Sep 17 00:00:00 2001
From: Rui Lan <abc99lr@gmail.com>
Date: Wed, 24 Jan 2024 21:43:48 -0800
Subject: [PATCH] Subsampling for IVF-PQ codebook generation (#2052)

This PR address https://github.com/rapidsai/raft/issues/1901 by subsampling the input dataset for PQ codebook training to reduce the runtime.

Currently, a similar strategy is applied to `per_cluster` method, but not to the default `per_subset` method. This PR fixes this gap. Similar to the subsampling mechanism of the `per_cluster` method, we pick at minimum `256*max(pq_book_size, pq_dim)` number of input rows for training each code book.

https://github.com/rapidsai/raft/blob/cf4e03d0b952c1baac73f695f94d6482d8c391d8/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh#L408

The following performance numbers are generated using Deep-100M dataset. After subsampling, the search time and accuracy are not impacted (within +-5%) except one case where I saw 9% performance drop on search (using 10K batch for search). More extensive benchmarking across datasets seems to be needed for justification.

Dataset | n_iter | n_list | pq_bits | pq_dim | ratio | Original time (s) | Subsampling (s) | Speedup [subsampling]
-- | -- | -- | -- | -- | -- | -- | -- | --
Deep-100M | 25 | 50000 | 4 | 96 | 10 | 129 | 89.5 | 1.44
Deep-100M | 25 | 50000 | 5 | 96 | 10 | 128 | 89.4 | 1.43
Deep-100M | 25 | 50000 | 6 | 96 | 10 | 131 | 90 | 1.46
Deep-100M | 25 | 50000 | 7 | 96 | 10 | 129 | 91.1 | 1.42
Deep-100M | 25 | 50000 | 8 | 96 | 10 | 149 | 93.4 | 1.60

Note, after subsampling, the PQ codebook generation is no longer a bottleneck in the IVF-PQ index building. More optimizations on PQ codebook generation seem unnecessary. Although we could in theory apply the custom kernel approach (https://github.com/rapidsai/raft/pull/2050)
 with subsampling, my early tests show the current GEMM approach performs better than the custom kernel after subsampling.

Using multiple stream could improve the performance further by overlapping kernels for different `pq_dim`, given kernels are small after subsampling and may not fully utilize GPU. However, as mention above, since the entire PQ codebook is fast, this optimization may not be worthwhile.

TODO

- [x] Benchmark the performance/accuracy impacts on multiple datasets

Authors:
  - Rui Lan (https://github.com/abc99lr)
  - Ray Douglass (https://github.com/raydouglass)
  - gpuCI (https://github.com/GPUtester)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2052
---
 .../src/raft/raft_ann_bench_param_parser.h    |  5 +++-
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 29 +++++++++++++------
 cpp/include/raft/neighbors/ivf_pq_types.hpp   | 10 ++++++-
 docs/source/ann_benchmarks_param_tuning.md    |  1 +
 .../neighbors/ivf_pq/cpp/c_ivf_pq.pxd         |  3 +-
 .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx     | 18 ++++++++++--
 6 files changed, 52 insertions(+), 14 deletions(-)
diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h
index 2a021a8a12..ae40deb50c 100644
--- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h
+++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -87,6 +87,9 @@ void parse_build_param(const nlohmann::json& conf,
                                "', should be either 'cluster' or 'subspace'");
     }
   }
+  if (conf.contains("max_train_points_per_pq_code")) {
+    param.max_train_points_per_pq_code = conf.at("max_train_points_per_pq_code");
+  }
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index cc94511fe7..0ef6cb13fb 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -353,14 +353,19 @@ void train_per_subset(raft::resources const& handle,
                       const float* trainset,   // [n_rows, dim]
                       const uint32_t* labels,  // [n_rows]
                       uint32_t kmeans_n_iters,
+                      uint32_t max_train_points_per_pq_code,
                       rmm::mr::device_memory_resource* managed_memory)
 {
   auto stream        = resource::get_cuda_stream(handle);
   auto device_memory = resource::get_workspace_resource(handle);
 
   rmm::device_uvector<float> pq_centers_tmp(index.pq_centers().size(), stream, device_memory);
-  rmm::device_uvector<float> sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory);
-  rmm::device_uvector<uint32_t> sub_labels(n_rows, stream, device_memory);
+  // Subsampling the train set for codebook generation based on max_train_points_per_pq_code.
+  size_t big_enough = max_train_points_per_pq_code * size_t(index.pq_book_size());
+  auto pq_n_rows    = uint32_t(std::min(big_enough, n_rows));
+  rmm::device_uvector<float> sub_trainset(
+    pq_n_rows * size_t(index.pq_len()), stream, device_memory);
+  rmm::device_uvector<uint32_t> sub_labels(pq_n_rows, stream, device_memory);
 
   rmm::device_uvector<uint32_t> pq_cluster_sizes(index.pq_book_size(), stream, device_memory);
 
@@ -371,7 +376,7 @@ void train_per_subset(raft::resources const& handle,
     // Get the rotated cluster centers for each training vector.
     // This will be subtracted from the input vectors afterwards.
     utils::copy_selected<float, float, size_t, uint32_t>(
-      n_rows,
+      pq_n_rows,
       index.pq_len(),
       index.centers_rot().data_handle() + index.pq_len() * j,
       labels,
@@ -387,7 +392,7 @@ void train_per_subset(raft::resources const& handle,
                  true,
                  false,
                  index.pq_len(),
-                 n_rows,
+                 pq_n_rows,
                  index.dim(),
                  &alpha,
                  index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j,
@@ -400,13 +405,14 @@ void train_per_subset(raft::resources const& handle,
                  stream);
 
     // train PQ codebook for this subspace
-    auto sub_trainset_view =
-      raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
+    auto sub_trainset_view = raft::make_device_matrix_view<const float, IdxT>(
+      sub_trainset.data(), pq_n_rows, index.pq_len());
     auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
       pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
       index.pq_book_size(),
       index.pq_len());
-    auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), n_rows);
+    auto sub_labels_view =
+      raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), pq_n_rows);
     auto cluster_sizes_view =
       raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
     raft::cluster::kmeans_balanced_params kmeans_params;
@@ -430,6 +436,7 @@ void train_per_cluster(raft::resources const& handle,
                        const float* trainset,   // [n_rows, dim]
                        const uint32_t* labels,  // [n_rows]
                        uint32_t kmeans_n_iters,
+                       uint32_t max_train_points_per_pq_code,
                        rmm::mr::device_memory_resource* managed_memory)
 {
   auto stream        = resource::get_cuda_stream(handle);
@@ -477,9 +484,11 @@ void train_per_cluster(raft::resources const& handle,
                      indices + cluster_offsets[l],
                      device_memory);
 
-    // limit the cluster size to bound the training time.
+    // limit the cluster size to bound the training time based on max_train_points_per_pq_code
+    // If pq_book_size is less than pq_dim, use max_train_points_per_pq_code per pq_dim instead
     // [sic] we interpret the data as pq_len-dimensional
-    size_t big_enough     = 256ul * std::max<size_t>(index.pq_book_size(), index.pq_dim());
+    size_t big_enough =
+      max_train_points_per_pq_code * std::max<size_t>(index.pq_book_size(), index.pq_dim());
     size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim());
     auto pq_n_rows        = uint32_t(std::min(big_enough, available_rows));
     // train PQ codebook for this cluster
@@ -1788,6 +1797,7 @@ auto build(raft::resources const& handle,
                          trainset.data_handle(),
                          labels.data(),
                          params.kmeans_n_iters,
+                         params.max_train_points_per_pq_code,
                          &managed_mr);
         break;
       case codebook_gen::PER_CLUSTER:
@@ -1797,6 +1807,7 @@ auto build(raft::resources const& handle,
                           trainset.data_handle(),
                           labels.data(),
                           params.kmeans_n_iters,
+                          params.max_train_points_per_pq_code,
                           &managed_mr);
         break;
       default: RAFT_FAIL("Unreachable code");
diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp
index 45ab18c84f..04c2354fbc 100644
--- a/cpp/include/raft/neighbors/ivf_pq_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -105,6 +105,14 @@ struct index_params : ann::index_params {
    * flag to `true` if you prefer to use as little GPU memory for the database as possible.
    */
   bool conservative_memory_allocation = false;
+  /**
+   * The max number of data points to use per PQ code during PQ codebook training. Using more data
+   * points per PQ code may increase the quality of PQ codebook but may also increase the build
+   * time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and
+   * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training
+   * points to train each codebook.
+   */
+  uint32_t max_train_points_per_pq_code = 256;
 };
 
 struct search_params : ann::search_params {
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
index afb4ed18ea..e003aa879c 100644
--- a/docs/source/ann_benchmarks_param_tuning.md
+++ b/docs/source/ann_benchmarks_param_tuning.md
@@ -38,6 +38,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | `pq_bits`              | `build`  | N | Positive Integer. [4-8]          | 8       | Bit length of the vector element after quantization.                                                                                                                            |
 | `codebook_kind`        | `build`  | N | ["cluster", "subspace"]          | "subspace" | Type of codebook. See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail                                 |
 | `dataset_memory_type`  | `build` | N | ["device", "host", "mmap"]       | "host" | What memory type should the dataset reside?                                                                                                                                       |
+| `max_train_points_per_pq_code`         | `build` | N | Positive Number >=1              | 256       | Max number of data points per PQ code used for PQ code book creation. Depending on input dataset size, the data points could be less than what user specifies.         |
 | `query_memory_type`    | `search` | N | ["device", "host", "mmap"]       | "device | What memory type should the queries reside? |
 | `nprobe`               | `search` | Y | Positive Integer >0              |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                     |
 | `internalDistanceDtype` | `search` | N | [`float`, `half`]                | `half`  | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy.                                                           |
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
index 531c2428e9..930c3245f1 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -78,6 +78,7 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \
         codebook_gen codebook_kind
         bool force_random_rotation
         bool conservative_memory_allocation
+        uint32_t max_train_points_per_pq_code
 
     cdef cppclass index[IdxT](ann_index):
         index(const device_resources& handle,
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
index 0c1bbf6b9c..7081b65ce3 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -156,6 +156,14 @@ cdef class IndexParams:
         repeated calls to `extend` (extending the database).
         To disable this behavior and use as little GPU memory for the
         database as possible, set this flat to `True`.
+    max_train_points_per_pq_code : int, default = 256
+        The max number of data points to use per PQ code during PQ codebook
+        training. Using more data points per PQ code may increase the
+        quality of PQ codebook but may also increase the build time. The
+        parameter is applied to both PQ codebook generation methods, i.e.,
+        PER_SUBSPACE and PER_CLUSTER. In both cases, we will use
+        pq_book_size * max_train_points_per_pq_code training points to
+        train each codebook.
     """
     def __init__(self, *,
                  n_lists=1024,
@@ -167,7 +175,8 @@ cdef class IndexParams:
                  codebook_kind="subspace",
                  force_random_rotation=False,
                  add_data_on_build=True,
-                 conservative_memory_allocation=False):
+                 conservative_memory_allocation=False,
+                 max_train_points_per_pq_code=256):
         self.params.n_lists = n_lists
         self.params.metric = _get_metric(metric)
         self.params.metric_arg = 0
@@ -185,6 +194,8 @@ cdef class IndexParams:
         self.params.add_data_on_build = add_data_on_build
         self.params.conservative_memory_allocation = \
             conservative_memory_allocation
+        self.params.max_train_points_per_pq_code = \
+            max_train_points_per_pq_code
 
     @property
     def n_lists(self):
@@ -226,6 +237,9 @@ cdef class IndexParams:
     def conservative_memory_allocation(self):
         return self.params.conservative_memory_allocation
 
+    @property
+    def max_train_points_per_pq_code(self):
+        return self.params.max_train_points_per_pq_code
 
 cdef class Index:
     # We store a pointer to the index because it dose not have a trivial