rapidsai · rapids-bot · Jan 25, 2024 · Dec 8, 2023 · Jan 6, 2024 · Jan 13, 2024
@@ -86,6 +86,8 @@ void parse_build_param(const nlohmann::json& conf,
       throw std::runtime_error("codebook_kind: '" + kind +
                                "', should be either 'cluster' or 'subspace'");
     }
+    if (conf.contains("pq_codebook_ratio")) { 
+      param.pq_codebook_trainset_fraction = 1.0 / (double)conf.at("pq_codebook_ratio"); }
   }
 }
 

@@ -395,14 +395,17 @@ void train_per_subset(raft::resources const& handle,
                       const float* trainset,   // [n_rows, dim]
                       const uint32_t* labels,  // [n_rows]
                       uint32_t kmeans_n_iters,
+                      double pq_codebook_trainset_fraction,
                       rmm::mr::device_memory_resource* managed_memory)
 {
   auto stream        = resource::get_cuda_stream(handle);
   auto device_memory = resource::get_workspace_resource(handle);
 
   rmm::device_uvector<float> pq_centers_tmp(index.pq_centers().size(), stream, device_memory);
-  rmm::device_uvector<float> sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory);
-  rmm::device_uvector<uint32_t> sub_labels(n_rows, stream, device_memory);
+  // Subsampling the train set for codebook generation based on pq_codebook_trainset_fraction. 
+  auto pq_n_rows     = uint32_t(n_rows * pq_codebook_trainset_fraction);
+  rmm::device_uvector<float> sub_trainset(pq_n_rows * size_t(index.pq_len()), stream, device_memory);
+  rmm::device_uvector<uint32_t> sub_labels(pq_n_rows, stream, device_memory);
 
   rmm::device_uvector<uint32_t> pq_cluster_sizes(index.pq_book_size(), stream, device_memory);
 
@@ -413,7 +416,7 @@ void train_per_subset(raft::resources const& handle,
     // Get the rotated cluster centers for each training vector.
     // This will be subtracted from the input vectors afterwards.
     utils::copy_selected<float, float, size_t, uint32_t>(
-      n_rows,
+      pq_n_rows,
       index.pq_len(),
       index.centers_rot().data_handle() + index.pq_len() * j,
       labels,
@@ -429,7 +432,7 @@ void train_per_subset(raft::resources const& handle,
                  true,
                  false,
                  index.pq_len(),
-                 n_rows,
+                 pq_n_rows,
                  index.dim(),
                  &alpha,
                  index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j,
@@ -443,12 +446,12 @@ void train_per_subset(raft::resources const& handle,
 
     // train PQ codebook for this subspace
     auto sub_trainset_view =
-      raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), n_rows, index.pq_len());
+      raft::make_device_matrix_view<const float, IdxT>(sub_trainset.data(), pq_n_rows, index.pq_len());
     auto centers_tmp_view = raft::make_device_matrix_view<float, IdxT>(
       pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j,
       index.pq_book_size(),
       index.pq_len());
-    auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), n_rows);
+    auto sub_labels_view = raft::make_device_vector_view<uint32_t, IdxT>(sub_labels.data(), pq_n_rows);
     auto cluster_sizes_view =
       raft::make_device_vector_view<uint32_t, IdxT>(pq_cluster_sizes.data(), index.pq_book_size());
     raft::cluster::kmeans_balanced_params kmeans_params;
@@ -1858,6 +1861,7 @@ auto build(raft::resources const& handle,
                          trainset.data(),
                          labels.data(),
                          params.kmeans_n_iters,
+                         params.pq_codebook_trainset_fraction,
                          &managed_memory_upstream);
         break;
       case codebook_gen::PER_CLUSTER:

@@ -105,6 +105,12 @@ struct index_params : ann::index_params {
    * flag to `true` if you prefer to use as little GPU memory for the database as possible.
    */
   bool conservative_memory_allocation = false;
+  /**
+   * The fraction of data to use during PQ codebook generation on top of the subsampled data
+   * controlled by kmeans_trainset_fraction. The parameter is only used when PQ codebook generation
+   * kind is PER_SUBSPACE and ignored when PQ codebook generation kind is PER_CLUSTER.
+   */
+  double pq_codebook_trainset_fraction = 1;
 };
 
 struct search_params : ann::search_params {

@@ -78,7 +78,8 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \
         codebook_gen codebook_kind
         bool force_random_rotation
         bool conservative_memory_allocation
-
+        double pq_codebook_trainset_fraction
+
     cdef cppclass index[IdxT](ann_index):
         index(const device_resources& handle,
               DistanceType metric,

@@ -156,6 +156,13 @@ cdef class IndexParams:
         repeated calls to `extend` (extending the database).
         To disable this behavior and use as little GPU memory for the
         database as possible, set this flat to `True`.
+    pq_codebook_trainset_fraction : float, default = 0.5
+        If pq_codebook_trainset_fraction is less than 1, then the dataset is
+        subsampled for PQ codebook generation, and only n_samples *
+        pq_codebook_trainset_fraction rows are used for PQ codebook generation.
+        This subsampling is applied after the kmeans trainset subsampling
+        controlled by kmeans_trainset_fraction, and is only used when
+        codebook_kind is "subspace" (PER_SUBSPACE).
     """
     def __init__(self, *,
                  n_lists=1024,
@@ -167,7 +174,8 @@ cdef class IndexParams:
                  codebook_kind="subspace",
                  force_random_rotation=False,
                  add_data_on_build=True,
-                 conservative_memory_allocation=False):
+                 conservative_memory_allocation=False,
+                 pq_codebook_trainset_fraction=0.5):
         self.params.n_lists = n_lists
         self.params.metric = _get_metric(metric)
         self.params.metric_arg = 0
@@ -185,6 +193,8 @@ cdef class IndexParams:
         self.params.add_data_on_build = add_data_on_build
         self.params.conservative_memory_allocation = \
             conservative_memory_allocation
+        self.params.pq_codebook_trainset_fraction = \
+            pq_codebook_trainset_fraction
 
     @property
     def n_lists(self):
@@ -226,6 +236,9 @@ cdef class IndexParams:
     def conservative_memory_allocation(self):
         return self.params.conservative_memory_allocation
 
+    @property
+    def pq_codebook_trainset_fraction(self):  # getter for params field
+        return self.params.pq_codebook_trainset_fraction
 
 cdef class Index:
     # We store a pointer to the index because it dose not have a trivial