diff --git a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
index 176309c8ce..3fb7b3005b 100644
--- a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
+++ b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
@@ -126,7 +126,8 @@ inline dtype_t get_numpy_dtype()
 }
 
 #if defined(_RAFT_HAS_CUDA)
-template <typename T, typename std::enable_if_t<std::is_same_v<T, half>, bool> = true>
+template <typename T,
+          typename std::enable_if_t<std::is_same_v<std::remove_cv_t<T>, half>, bool> = true>
 inline dtype_t get_numpy_dtype()
 {
   return {RAFT_NUMPY_HOST_ENDIAN_CHAR, 'e', sizeof(T)};
diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index b8258297e6..b7e362f704 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -25,6 +25,7 @@
 #include <raft/core/mdspan.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/neighbors/cagra_types.hpp>
+#include <raft/neighbors/dataset.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -279,62 +280,6 @@ index<T, IdxT> build(raft::resources const& res,
   return detail::build<T, IdxT, Accessor>(res, params, dataset);
 }
 
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [cagra::build](#cagra::build) documentation for a usage example.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] res raft resources
- * @param[in] params configure the search
- * @param[in] idx cagra index
- * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
- * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
- * k]
- */
-template <typename T, typename IdxT>
-void search(raft::resources const& res,
-            const search_params& params,
-            const index<T, IdxT>& idx,
-            raft::device_matrix_view<const T, int64_t, row_major> queries,
-            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
-            raft::device_matrix_view<float, int64_t, row_major> distances)
-{
-  RAFT_EXPECTS(
-    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
-    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
-
-  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
-               "Number of columns in output neighbors and distances matrices must equal k");
-  RAFT_EXPECTS(queries.extent(1) == idx.dim(),
-               "Number of query dimensions should equal number of dimensions in the index.");
-
-  using internal_IdxT   = typename std::make_unsigned<IdxT>::type;
-  auto queries_internal = raft::make_device_matrix_view<const T, int64_t, row_major>(
-    queries.data_handle(), queries.extent(0), queries.extent(1));
-  auto neighbors_internal = raft::make_device_matrix_view<internal_IdxT, int64_t, row_major>(
-    reinterpret_cast<internal_IdxT*>(neighbors.data_handle()),
-    neighbors.extent(0),
-    neighbors.extent(1));
-  auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
-    distances.data_handle(), distances.extent(0), distances.extent(1));
-
-  cagra::detail::search_main<T,
-                             internal_IdxT,
-                             decltype(raft::neighbors::filtering::none_cagra_sample_filter()),
-                             IdxT>(res,
-                                   params,
-                                   idx,
-                                   queries_internal,
-                                   neighbors_internal,
-                                   distances_internal,
-                                   raft::neighbors::filtering::none_cagra_sample_filter());
-}
-
 /**
  * @brief Search ANN using the constructed index with the given sample filter.
  *
@@ -401,10 +346,40 @@ void search_with_filtering(raft::resources const& res,
   auto distances_internal = raft::make_device_matrix_view<float, int64_t, row_major>(
     distances.data_handle(), distances.extent(0), distances.extent(1));
 
-  cagra::detail::search_main<T, internal_IdxT, CagraSampleFilterT, IdxT>(
+  return cagra::detail::search_main<T, internal_IdxT, CagraSampleFilterT, IdxT>(
     res, params, idx, queries_internal, neighbors_internal, distances_internal, sample_filter);
 }
 
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [cagra::build](#cagra::build) documentation for a usage example.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the search
+ * @param[in] idx cagra index
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
+ * k]
+ */
+template <typename T, typename IdxT>
+void search(raft::resources const& res,
+            const search_params& params,
+            const index<T, IdxT>& idx,
+            raft::device_matrix_view<const T, int64_t, row_major> queries,
+            raft::device_matrix_view<IdxT, int64_t, row_major> neighbors,
+            raft::device_matrix_view<float, int64_t, row_major> distances)
+{
+  using none_filter_type = raft::neighbors::filtering::none_cagra_sample_filter;
+  return cagra::search_with_filtering<T, IdxT, none_filter_type>(
+    res, params, idx, queries, neighbors, distances, none_filter_type{});
+}
+
 /** @} */  // end group cagra
 
 }  // namespace raft::neighbors::cagra
diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 0f574ae5bb..807f89fd65 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "ann_types.hpp"
+#include "dataset.hpp"
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/error.hpp>
@@ -35,6 +36,7 @@
 #include <optional>
 #include <string>
 #include <type_traits>
+
 namespace raft::neighbors::cagra {
 /**
  * @addtogroup cagra
@@ -61,6 +63,12 @@ struct index_params : ann::index_params {
   graph_build_algo build_algo = graph_build_algo::IVF_PQ;
   /** Number of Iterations to run if building with NN_DESCENT */
   size_t nn_descent_niter = 20;
+  /**
+   * Specify compression params if compression is desired.
+   *
+   * NOTE: this is experimental new API, consider it unsafe.
+   */
+  std::optional<vpq_params> compression = std::nullopt;
 };
 
 enum class search_algo {
@@ -145,25 +153,37 @@ struct index : ann::index {
   /** Total length of the index (number of vectors). */
   [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT
   {
-    return dataset_view_.extent(0) ? dataset_view_.extent(0) : graph_view_.extent(0);
+    auto data_rows = dataset_->n_rows();
+    return data_rows > 0 ? data_rows : graph_view_.extent(0);
   }
 
   /** Dimensionality of the data. */
-  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t
-  {
-    return dataset_view_.extent(1);
-  }
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t { return dataset_->dim(); }
   /** Graph degree */
   [[nodiscard]] constexpr inline auto graph_degree() const noexcept -> uint32_t
   {
     return graph_view_.extent(1);
   }
 
-  /** Dataset [size, dim] */
-  [[nodiscard]] inline auto dataset() const noexcept
+  /**
+   * DEPRECATED: please use data() instead.
+   *   If you need to query dataset dimensions, use the dim() and size() of the cagra index.
+   *   The data_handle() is not always available: you need to do a dynamic_cast to the expected
+   *   dataset type at runtime.
+   */
+  [[nodiscard]] [[deprecated("Use data()")]] inline auto dataset() const noexcept
     -> device_matrix_view<const T, int64_t, layout_stride>
   {
-    return dataset_view_;
+    auto p = dynamic_cast<strided_dataset<T, int64_t>*>(dataset_.get());
+    if (p != nullptr) { return p->view(); }
+    auto d = dataset_->dim();
+    return make_device_strided_matrix_view<const T, int64_t>(nullptr, 0, d, d);
+  }
+
+  /** Dataset [size, dim] */
+  [[nodiscard]] inline auto data() const noexcept -> const neighbors::dataset<int64_t>&
+  {
+    return *dataset_;
   }
 
   /** neighborhood graph [size, graph-degree] */
@@ -185,7 +205,7 @@ struct index : ann::index {
         raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
     : ann::index(),
       metric_(metric),
-      dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
+      dataset_(new neighbors::empty_dataset<int64_t>(0)),
       graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
   {
   }
@@ -251,12 +271,11 @@ struct index : ann::index {
         mdspan<const IdxT, matrix_extent<int64_t>, row_major, graph_accessor> knn_graph)
     : ann::index(),
       metric_(metric),
-      dataset_(make_device_matrix<T, int64_t>(res, 0, 0)),
+      dataset_(make_aligned_dataset(res, dataset, 16)),
       graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
   {
     RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
                  "Dataset and knn_graph must have equal number of rows");
-    update_dataset(res, dataset);
     update_graph(res, knn_graph);
     resource::sync_stream(res);
   }
@@ -271,21 +290,14 @@ struct index : ann::index {
   void update_dataset(raft::resources const& res,
                       raft::device_matrix_view<const T, int64_t, row_major> dataset)
   {
-    if (dataset.extent(1) * sizeof(T) % 16 != 0) {
-      RAFT_LOG_DEBUG("Creating a padded copy of CAGRA dataset in device memory");
-      copy_padded(res, dataset);
-    } else {
-      dataset_view_ = make_device_strided_matrix_view<const T, int64_t>(
-        dataset.data_handle(), dataset.extent(0), dataset.extent(1), dataset.extent(1));
-    }
+    dataset_ = make_aligned_dataset(res, dataset, 16);
   }
 
   /** Set the dataset reference explicitly to a device matrix view with padding. */
-  void update_dataset(raft::resources const&,
+  void update_dataset(raft::resources const& res,
                       raft::device_matrix_view<const T, int64_t, layout_stride> dataset)
   {
-    RAFT_EXPECTS(dataset.stride(0) * sizeof(T) % 16 == 0, "Incorrect data padding.");
-    dataset_view_ = dataset;
+    dataset_ = make_aligned_dataset(res, dataset, 16);
   }
 
   /**
@@ -296,8 +308,22 @@ struct index : ann::index {
   void update_dataset(raft::resources const& res,
                       raft::host_matrix_view<const T, int64_t, row_major> dataset)
   {
-    RAFT_LOG_DEBUG("Copying CAGRA dataset from host to device");
-    copy_padded(res, dataset);
+    dataset_ = make_aligned_dataset(res, dataset, 16);
+  }
+
+  /** Replace the dataset with a new dataset. */
+  template <typename DatasetT>
+  auto update_dataset(raft::resources const& res, DatasetT&& dataset)
+    -> std::enable_if_t<std::is_base_of_v<neighbors::dataset<int64_t>, DatasetT>>
+  {
+    dataset_ = std::make_unique<DatasetT>(std::move(dataset));
+  }
+
+  template <typename DatasetT>
+  auto update_dataset(raft::resources const& res, std::unique_ptr<DatasetT>&& dataset)
+    -> std::enable_if_t<std::is_base_of_v<neighbors::dataset<int64_t>, DatasetT>>
+  {
+    dataset_ = std::move(dataset);
   }
 
   /**
@@ -334,26 +360,10 @@ struct index : ann::index {
   }
 
  private:
-  /** Create a device copy of the dataset, and pad it if necessary. */
-  template <typename data_accessor>
-  void copy_padded(raft::resources const& res,
-                   mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> dataset)
-  {
-    detail::copy_with_padding(res, dataset_, dataset);
-
-    dataset_view_ = make_device_strided_matrix_view<const T, int64_t>(
-      dataset_.data_handle(), dataset_.extent(0), dataset.extent(1), dataset_.extent(1));
-    RAFT_LOG_DEBUG("CAGRA dataset strided matrix view %zux%zu, stride %zu",
-                   static_cast<size_t>(dataset_view_.extent(0)),
-                   static_cast<size_t>(dataset_view_.extent(1)),
-                   static_cast<size_t>(dataset_view_.stride(0)));
-  }
-
   raft::distance::DistanceType metric_;
-  raft::device_matrix<T, int64_t, row_major> dataset_;
   raft::device_matrix<IdxT, int64_t, row_major> graph_;
-  raft::device_matrix_view<const T, int64_t, layout_stride> dataset_view_;
   raft::device_matrix_view<const IdxT, int64_t, row_major> graph_view_;
+  std::unique_ptr<neighbors::dataset<int64_t>> dataset_;
 };
 
 /** @} */
diff --git a/cpp/include/raft/neighbors/dataset.hpp b/cpp/include/raft/neighbors/dataset.hpp
new file mode 100644
index 0000000000..e7a3ba97a4
--- /dev/null
+++ b/cpp/include/raft/neighbors/dataset.hpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>   // get_device_for_address
+#include <raft/util/integer_utils.hpp>  // rounding up
+
+#include <memory>
+#include <numeric>
+#include <type_traits>
+
+#ifdef __cpp_lib_bitops
+#include <bit>
+#endif
+
+namespace raft::neighbors {
+
+/** Two-dimensional dataset; maybe owning, maybe compressed, maybe strided. */
+template <typename IdxT>
+struct dataset {
+  using index_type = IdxT;
+  /**  Size of the dataset. */
+  [[nodiscard]] virtual auto n_rows() const noexcept -> index_type = 0;
+  /** Dimensionality of the dataset. */
+  [[nodiscard]] virtual auto dim() const noexcept -> uint32_t = 0;
+  /** Whether the object owns the data. */
+  [[nodiscard]] virtual auto is_owning() const noexcept -> bool = 0;
+  virtual ~dataset() noexcept                                   = default;
+};
+
+template <typename IdxT>
+struct empty_dataset : public dataset<IdxT> {
+  using index_type = IdxT;
+  uint32_t suggested_dim;
+  explicit empty_dataset(uint32_t dim) noexcept : suggested_dim(dim) {}
+  [[nodiscard]] auto n_rows() const noexcept -> index_type final { return 0; }
+  [[nodiscard]] auto dim() const noexcept -> uint32_t final { return suggested_dim; }
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
+};
+
+template <typename DataT, typename IdxT>
+struct strided_dataset : public dataset<IdxT> {
+  using index_type = IdxT;
+  using value_type = DataT;
+  using view_type  = device_matrix_view<const value_type, index_type, layout_stride>;
+  [[nodiscard]] auto n_rows() const noexcept -> index_type final { return view().extent(0); }
+  [[nodiscard]] auto dim() const noexcept -> uint32_t final
+  {
+    return static_cast<uint32_t>(view().extent(1));
+  }
+  /** Leading dimension of the dataset. */
+  [[nodiscard]] constexpr auto stride() const noexcept -> uint32_t
+  {
+    auto v = view();
+    return static_cast<uint32_t>(v.stride(0) > 0 ? v.stride(0) : v.extent(1));
+  }
+  /** Get the view of the data. */
+  [[nodiscard]] virtual auto view() const noexcept -> view_type;
+};
+
+template <typename DataT, typename IdxT>
+struct non_owning_dataset : public strided_dataset<DataT, IdxT> {
+  using index_type = IdxT;
+  using value_type = DataT;
+  using typename strided_dataset<value_type, index_type>::view_type;
+  view_type data;
+  explicit non_owning_dataset(view_type v) noexcept : data(v) {}
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return false; }
+  [[nodiscard]] auto view() const noexcept -> view_type final { return data; };
+};
+
+template <typename DataT, typename IdxT, typename LayoutPolicy, typename ContainerPolicy>
+struct owning_dataset : public strided_dataset<DataT, IdxT> {
+  using index_type = IdxT;
+  using value_type = DataT;
+  using typename strided_dataset<value_type, index_type>::view_type;
+  using storage_type =
+    mdarray<value_type, matrix_extent<index_type>, LayoutPolicy, ContainerPolicy>;
+  using mapping_type = typename view_type::mapping_type;
+  storage_type data;
+  mapping_type view_mapping;
+  owning_dataset(storage_type&& store, mapping_type view_mapping) noexcept
+    : data{std::move(store)}, view_mapping{view_mapping}
+  {
+  }
+
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
+  [[nodiscard]] auto view() const noexcept -> view_type final
+  {
+    return view_type{data.data_handle(), view_mapping};
+  };
+};
+
+/**
+ * @brief Contstruct a strided matrix from any mdarray or mdspan.
+ *
+ * This function constructs a non-owning view if the input satisfied two conditions:
+ *
+ *   1) The data is accessible from the current device
+ *   2) The memory layout is the same as expected (row-major matrix with the required stride)
+ *
+ * Otherwise, this function constructs an owning device matrix and copies the data.
+ * When the data is copied, padding elements are filled with zeroes.
+ *
+ * @tparam SrcT the source mdarray or mdspan
+ *
+ * @param[in] res raft resources handle
+ * @param[in] src the source mdarray or mdspan
+ * @param[in] required_stride the leading dimension (in elements)
+ * @return maybe owning current-device-accessible strided matrix
+ */
+template <typename SrcT>
+auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t required_stride)
+  -> std::unique_ptr<strided_dataset<typename SrcT::value_type, typename SrcT::index_type>>
+{
+  using extents_type = typename SrcT::extents_type;
+  using value_type   = typename SrcT::value_type;
+  using index_type   = typename SrcT::index_type;
+  using layout_type  = typename SrcT::layout_type;
+  static_assert(extents_type::rank() == 2, "The input must be a matrix.");
+  static_assert(std::is_same_v<layout_type, layout_right> ||
+                  std::is_same_v<layout_type, layout_right_padded<value_type>> ||
+                  std::is_same_v<layout_type, layout_stride>,
+                "The input must be row-major");
+  RAFT_EXPECTS(src.extent(1) <= required_stride,
+               "The input row length must be not larger than the desired stride.");
+  cudaPointerAttributes ptr_attrs;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&ptr_attrs, src.data_handle()));
+  auto* device_ptr             = reinterpret_cast<value_type*>(ptr_attrs.devicePointer);
+  const uint32_t src_stride    = src.stride(0) > 0 ? src.stride(0) : src.extent(1);
+  const bool device_accessible = device_ptr != nullptr;
+  const bool row_major         = src.stride(1) <= 1;
+  const bool stride_matches    = required_stride == src_stride;
+
+  if (device_accessible && row_major && stride_matches) {
+    // Everything matches: make a non-owning dataset
+    return std::make_unique<non_owning_dataset<value_type, index_type>>(
+      make_device_strided_matrix_view<const value_type, index_type>(
+        device_ptr, src.extent(0), src.extent(1), required_stride));
+  }
+  // Something is wrong: have to make a copy and produce an owning dataset
+  auto out_layout =
+    make_strided_layout(src.extents(), std::array<index_type, 2>{required_stride, 1});
+  auto out_array = make_device_matrix<value_type, index_type>(res, src.extent(0), required_stride);
+
+  using out_mdarray_type          = decltype(out_array);
+  using out_layout_type           = typename out_mdarray_type::layout_type;
+  using out_container_policy_type = typename out_mdarray_type::container_policy_type;
+  using out_owning_type =
+    owning_dataset<value_type, index_type, out_layout_type, out_container_policy_type>;
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(out_array.data_handle(),
+                                0,
+                                out_array.size() * sizeof(value_type),
+                                resource::get_cuda_stream(res)));
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(),
+                                  sizeof(value_type) * required_stride,
+                                  src.data_handle(),
+                                  sizeof(value_type) * src_stride,
+                                  sizeof(value_type) * src.extent(1),
+                                  src.extent(0),
+                                  cudaMemcpyDefault,
+                                  resource::get_cuda_stream(res)));
+
+  return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
+}
+
+/**
+ * @brief Contstruct a strided matrix from any mdarray or mdspan.
+ *
+ * A variant `make_strided_dataset` that allows specifying the byte alignment instead of the
+ * explicit stride length.
+ *
+ * @tparam SrcT the source mdarray or mdspan
+ *
+ * @param[in] res raft resources handle
+ * @param[in] src the source mdarray or mdspan
+ * @param[in] align_bytes the required byte alignment for the dataset rows.
+ * @return maybe owning current-device-accessible strided matrix
+ */
+template <typename SrcT>
+auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t align_bytes = 16)
+  -> std::unique_ptr<strided_dataset<typename SrcT::value_type, typename SrcT::index_type>>
+{
+  using value_type       = typename SrcT::value_type;
+  constexpr size_t kSize = sizeof(value_type);
+  uint32_t required_stride =
+    raft::round_up_safe<size_t>(src.extent(1) * kSize, std::lcm(align_bytes, kSize)) / kSize;
+  return make_strided_dataset(res, src, required_stride);
+}
+
+/** Parameters for VPQ compression. */
+struct vpq_params {
+  /**
+   * The bit length of the vector element after compression by PQ.
+   *
+   * Possible values: [4, 5, 6, 7, 8].
+   *
+   * Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
+   * performance, but the lower the recall.
+   */
+  uint32_t pq_bits = 8;
+  /**
+   * The dimensionality of the vector after compression by PQ.
+   * When zero, an optimal value is selected using a heuristic.
+   *
+   * TODO: at the moment `dim` must be a multiple `pq_dim`.
+   */
+  uint32_t pq_dim = 0;
+  /**
+   * Vector Quantization (VQ) codebook size - number of "coarse cluster centers".
+   * When zero, an optimal value is selected using a heuristic.
+   */
+  uint32_t vq_n_centers = 0;
+  /** The number of iterations searching for kmeans centers (both VQ & PQ phases). */
+  uint32_t kmeans_n_iters = 25;
+  /**
+   * The fraction of data to use during iterative kmeans building (VQ phase).
+   * When zero, an optimal value is selected using a heuristic.
+   */
+  double vq_kmeans_trainset_fraction = 0;
+  /**
+   * The fraction of data to use during iterative kmeans building (PQ phase).
+   * When zero, an optimal value is selected using a heuristic.
+   */
+  double pq_kmeans_trainset_fraction = 0;
+};
+
+/**
+ * @brief VPQ compressed dataset.
+ *
+ * The dataset is compressed using two level quantization
+ *
+ *   1. Vector Quantization
+ *   2. Product Quantization of residuals
+ *
+ * @tparam MathT the type of elements in the codebooks
+ * @tparam IdxT type of the vector indices (represent dataset.extent(0))
+ *
+ */
+template <typename MathT, typename IdxT>
+struct vpq_dataset : public dataset<IdxT> {
+  /** Vector Quantization codebook - "coarse cluster centers". */
+  device_matrix<MathT, uint32_t, row_major> vq_code_book;
+  /** Product Quantization codebook - "fine cluster centers".  */
+  device_matrix<MathT, uint32_t, row_major> pq_code_book;
+  /** Compressed dataset.  */
+  device_matrix<uint8_t, IdxT, row_major> data;
+
+  vpq_dataset(device_matrix<MathT, uint32_t, row_major>&& vq_code_book,
+              device_matrix<MathT, uint32_t, row_major>&& pq_code_book,
+              device_matrix<uint8_t, IdxT, row_major>&& data)
+    : vq_code_book{std::move(vq_code_book)},
+      pq_code_book{std::move(pq_code_book)},
+      data{std::move(data)}
+  {
+  }
+
+  [[nodiscard]] auto n_rows() const noexcept -> IdxT final { return data.extent(0); }
+  [[nodiscard]] auto dim() const noexcept -> uint32_t final { return vq_code_book.extent(1); }
+  [[nodiscard]] auto is_owning() const noexcept -> bool final { return true; }
+
+  /** Row length of the encoded data in bytes. */
+  [[nodiscard]] constexpr inline auto encoded_row_length() const noexcept -> uint32_t
+  {
+    return data.extent(1);
+  }
+  /** The number of "coarse cluster centers" */
+  [[nodiscard]] constexpr inline auto vq_n_centers() const noexcept -> uint32_t
+  {
+    return vq_code_book.extent(0);
+  }
+  /** The bit length of an encoded vector element after compression by PQ. */
+  [[nodiscard]] constexpr inline auto pq_bits() const noexcept -> uint32_t
+  {
+    /*
+    NOTE: pq_bits and the book size
+
+    Normally, we'd store `pq_bits` as a part of the index.
+    However, we know there's an invariant `pq_n_centers = 1 << pq_bits`, i.e. the codebook size is
+    the same as the number of possible code values. Hence, we don't store the pq_bits and derive it
+    from the array dimensions instead.
+     */
+    auto pq_width = pq_n_centers();
+#ifdef __cpp_lib_bitops
+    return std::countr_zero(pq_width);
+#else
+    uint32_t pq_bits = 0;
+    while (pq_width > 1) {
+      pq_bits++;
+      pq_width >>= 1;
+    }
+    return pq_bits;
+#endif
+  }
+  /** The dimensionality of an encoded vector after compression by PQ. */
+  [[nodiscard]] constexpr inline auto pq_dim() const noexcept -> uint32_t
+  {
+    return raft::div_rounding_up_unsafe(dim(), pq_len());
+  }
+  /** Dimensionality of a subspaces, i.e. the number of vector components mapped to a subspace */
+  [[nodiscard]] constexpr inline auto pq_len() const noexcept -> uint32_t
+  {
+    return pq_code_book.extent(1);
+  }
+  /** The number of vectors in a PQ codebook (`1 << pq_bits`). */
+  [[nodiscard]] constexpr inline auto pq_n_centers() const noexcept -> uint32_t
+  {
+    return pq_code_book.extent(0);
+  }
+};
+
+}  // namespace raft::neighbors
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
index 08cc2beaeb..d91e45257e 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "../../cagra_types.hpp"
+#include "../../vpq_dataset.cuh"
 #include "graph_core.cuh"
 
 #include <raft/core/device_mdarray.hpp>
@@ -344,6 +345,15 @@ index<T, IdxT> build(
   RAFT_LOG_INFO("Graph optimized, creating index");
   // Construct an index from dataset and optimized knn graph.
   if (construct_index_with_dataset) {
+    if (params.compression.has_value()) {
+      index<T, IdxT> idx(res, params.metric);
+      idx.update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
+      idx.update_dataset(
+        res,
+        // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later
+        neighbors::vpq_build<decltype(dataset), half, int64_t>(res, *params.compression, dataset));
+      return idx;
+    }
     return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));
   } else {
     // We just add the graph. User is expected to update dataset separately. This branch is used
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
index d7bd27222b..600c8785e0 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -23,6 +23,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/serialize.hpp>
 #include <raft/neighbors/cagra_types.hpp>
+#include <raft/neighbors/detail/dataset_serialize.hpp>
 
 #include <cstddef>
 #include <cstdint>
@@ -31,7 +32,7 @@
 
 namespace raft::neighbors::cagra::detail {
 
-constexpr int serialization_version = 3;
+constexpr int serialization_version = 4;
 
 /**
  * Save the index to file.
@@ -65,26 +66,14 @@ void serialize(raft::resources const& res,
   serialize_scalar(res, os, index_.metric());
   serialize_mdspan(res, os, index_.graph());
 
-  include_dataset &= (index_.dataset().extent(0) > 0);
+  include_dataset &= (index_.data().n_rows() > 0);
 
   serialize_scalar(res, os, include_dataset);
   if (include_dataset) {
     RAFT_LOG_INFO("Saving CAGRA index with dataset");
-    auto dataset = index_.dataset();
-    // Remove padding before saving the dataset
-    auto host_dataset = make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.data_handle(),
-                                    sizeof(T) * dataset.stride(0),
-                                    sizeof(T) * host_dataset.extent(1),
-                                    dataset.extent(0),
-                                    cudaMemcpyDefault,
-                                    resource::get_cuda_stream(res)));
-    resource::sync_stream(res);
-    serialize_mdspan(res, os, host_dataset.view());
+    neighbors::detail::serialize(res, os, index_.data());
   } else {
-    RAFT_LOG_INFO("Saving CAGRA index WITHOUT dataset");
+    RAFT_LOG_DEBUG("Saving CAGRA index WITHOUT dataset");
   }
 }
 
@@ -256,19 +245,13 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index<T, IdxT>
   auto graph = raft::make_host_matrix<IdxT, int64_t>(n_rows, graph_degree);
   deserialize_mdspan(res, is, graph.view());
 
+  index<T, IdxT> idx(res, metric);
+  idx.update_graph(res, raft::make_const_mdspan(graph.view()));
   bool has_dataset = deserialize_scalar<bool>(res, is);
   if (has_dataset) {
-    auto dataset = raft::make_host_matrix<T, int64_t>(n_rows, dim);
-    deserialize_mdspan(res, is, dataset.view());
-    return index<T, IdxT>(
-      res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view()));
-  } else {
-    // create a new index with no dataset - the user must supply via update_dataset themselves
-    // later (this avoids allocating GPU memory in the meantime)
-    index<T, IdxT> idx(res, metric);
-    idx.update_graph(res, raft::make_const_mdspan(graph.view()));
-    return idx;
+    idx.update_dataset(res, neighbors::detail::deserialize_dataset<int64_t>(res, is));
   }
+  return idx;
 }
 
 template <typename T, typename IdxT>
diff --git a/cpp/include/raft/neighbors/detail/dataset_serialize.hpp b/cpp/include/raft/neighbors/detail/dataset_serialize.hpp
new file mode 100644
index 0000000000..a6a6ae59a5
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/dataset_serialize.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../dataset.hpp"
+
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/core/serialize.hpp>
+
+#include <cuda_fp16.h>
+
+#include <fstream>
+#include <memory>
+
+namespace raft::neighbors::detail {
+
+using dataset_instance_tag                              = uint32_t;
+constexpr dataset_instance_tag kSerializeEmptyDataset   = 1;
+constexpr dataset_instance_tag kSerializeStridedDataset = 2;
+constexpr dataset_instance_tag kSerializeVPQDataset     = 3;
+
+template <typename IdxT>
+void serialize(const raft::resources& res, std::ostream& os, const empty_dataset<IdxT>& dataset)
+{
+  serialize_scalar(res, os, dataset.suggested_dim);
+}
+
+template <typename DataT, typename IdxT>
+void serialize(const raft::resources& res,
+               std::ostream& os,
+               const strided_dataset<DataT, IdxT>& dataset)
+{
+  auto n_rows = dataset.n_rows();
+  auto dim    = dataset.dim();
+  auto stride = dataset.stride();
+  serialize_scalar(res, os, n_rows);
+  serialize_scalar(res, os, dim);
+  serialize_scalar(res, os, stride);
+  // Remove padding before saving the dataset
+  auto src = dataset.view();
+  auto dst = make_host_matrix<DataT, IdxT>(n_rows, dim);
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(dst.data_handle(),
+                                  sizeof(DataT) * dim,
+                                  src.data_handle(),
+                                  sizeof(DataT) * stride,
+                                  sizeof(DataT) * dim,
+                                  n_rows,
+                                  cudaMemcpyDefault,
+                                  resource::get_cuda_stream(res)));
+  resource::sync_stream(res);
+  serialize_mdspan(res, os, dst.view());
+}
+
+template <typename MathT, typename IdxT>
+void serialize(const raft::resources& res,
+               std::ostream& os,
+               const vpq_dataset<MathT, IdxT>& dataset)
+{
+  serialize_scalar(res, os, dataset.n_rows());
+  serialize_scalar(res, os, dataset.dim());
+  serialize_scalar(res, os, dataset.vq_n_centers());
+  serialize_scalar(res, os, dataset.pq_n_centers());
+  serialize_scalar(res, os, dataset.pq_len());
+  serialize_scalar(res, os, dataset.encoded_row_length());
+  serialize_mdspan(res, os, make_const_mdspan(dataset.vq_code_book.view()));
+  serialize_mdspan(res, os, make_const_mdspan(dataset.pq_code_book.view()));
+  serialize_mdspan(res, os, make_const_mdspan(dataset.data.view()));
+}
+
+template <typename IdxT>
+void serialize(const raft::resources& res, std::ostream& os, const dataset<IdxT>& dataset)
+{
+  if (auto x = dynamic_cast<const empty_dataset<IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeEmptyDataset);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<float, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_32F);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<half, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_16F);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<int8_t, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_8I);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const strided_dataset<uint8_t, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeStridedDataset);
+    serialize_scalar(res, os, CUDA_R_8U);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const vpq_dataset<float, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeVPQDataset);
+    serialize_scalar(res, os, CUDA_R_32F);
+    return serialize(res, os, *x);
+  }
+  if (auto x = dynamic_cast<const vpq_dataset<half, IdxT>*>(&dataset); x != nullptr) {
+    serialize_scalar(res, os, kSerializeVPQDataset);
+    serialize_scalar(res, os, CUDA_R_16F);
+    return serialize(res, os, *x);
+  }
+  RAFT_FAIL("unsupported dataset type.");
+}
+
+template <typename IdxT>
+auto deserialize_empty(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<empty_dataset<IdxT>>
+{
+  auto suggested_dim = deserialize_scalar<uint32_t>(res, is);
+  return std::make_unique<empty_dataset<IdxT>>(suggested_dim);
+}
+
+template <typename DataT, typename IdxT>
+auto deserialize_strided(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<strided_dataset<DataT, IdxT>>
+{
+  auto n_rows     = deserialize_scalar<IdxT>(res, is);
+  auto dim        = deserialize_scalar<uint32_t>(res, is);
+  auto stride     = deserialize_scalar<uint32_t>(res, is);
+  auto host_array = make_host_matrix<DataT, IdxT>(n_rows, dim);
+  deserialize_mdspan(res, is, host_array.view());
+  return make_strided_dataset(res, host_array, stride);
+}
+
+template <typename MathT, typename IdxT>
+auto deserialize_vpq(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<vpq_dataset<MathT, IdxT>>
+{
+  auto n_rows             = deserialize_scalar<IdxT>(res, is);
+  auto dim                = deserialize_scalar<uint32_t>(res, is);
+  auto vq_n_centers       = deserialize_scalar<uint32_t>(res, is);
+  auto pq_n_centers       = deserialize_scalar<uint32_t>(res, is);
+  auto pq_len             = deserialize_scalar<uint32_t>(res, is);
+  auto encoded_row_length = deserialize_scalar<uint32_t>(res, is);
+
+  auto vq_code_book = make_device_matrix<MathT, uint32_t, row_major>(res, vq_n_centers, dim);
+  auto pq_code_book = make_device_matrix<MathT, uint32_t, row_major>(res, pq_n_centers, pq_len);
+  auto data         = make_device_matrix<uint8_t, IdxT, row_major>(res, n_rows, encoded_row_length);
+
+  deserialize_mdspan(res, is, vq_code_book.view());
+  deserialize_mdspan(res, is, pq_code_book.view());
+  deserialize_mdspan(res, is, data.view());
+
+  return std::make_unique<vpq_dataset<MathT, IdxT>>(
+    std::move(vq_code_book), std::move(pq_code_book), std::move(data));
+}
+
+template <typename IdxT>
+auto deserialize_dataset(raft::resources const& res, std::istream& is)
+  -> std::unique_ptr<dataset<IdxT>>
+{
+  switch (deserialize_scalar<dataset_instance_tag>(res, is)) {
+    case kSerializeEmptyDataset: return deserialize_empty<IdxT>(res, is);
+    case kSerializeStridedDataset:
+      switch (deserialize_scalar<cudaDataType_t>(res, is)) {
+        case CUDA_R_32F: return deserialize_strided<float, IdxT>(res, is);
+        case CUDA_R_16F: return deserialize_strided<half, IdxT>(res, is);
+        case CUDA_R_8I: return deserialize_strided<int8_t, IdxT>(res, is);
+        case CUDA_R_8U: return deserialize_strided<uint8_t, IdxT>(res, is);
+        default: break;
+      }
+    case kSerializeVPQDataset:
+      switch (deserialize_scalar<cudaDataType_t>(res, is)) {
+        case CUDA_R_32F: return deserialize_vpq<float, IdxT>(res, is);
+        case CUDA_R_16F: return deserialize_vpq<half, IdxT>(res, is);
+        default: break;
+      }
+    default: break;
+  }
+  RAFT_FAIL("Failed to deserialize dataset: unsupported combination of instance tags.");
+}
+
+}  // namespace raft::neighbors::detail
diff --git a/cpp/include/raft/neighbors/detail/vpq_dataset.cuh b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
new file mode 100644
index 0000000000..f6cd2a1ceb
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../dataset.hpp"
+
+#include <raft/cluster/kmeans_balanced.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/neighbors/detail/ivf_pq_build.cuh>  // pq_bits-bitfield
+#include <raft/spatial/knn/detail/ann_utils.cuh>   // utils::mapping etc
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+// A temporary stub till https://github.com/rapidsai/raft/pull/2077 is re-merged
+namespace raft::util {
+
+/**
+ * Subsample the dataset to create a training set.
+ *
+ * @tparam DatasetT a row-major mdspan or mdarray (device or host)
+ *
+ * @param res raft handle
+ * @param dataset input row-major mdspan or mdarray (device or host)
+ * @param n_samples the size of the output mdarray
+ *
+ * @return a newly allocated subset of the dataset.
+ */
+template <typename DatasetT>
+auto subsample(raft::resources const& res,
+               const DatasetT& dataset,
+               typename DatasetT::index_type n_samples)
+  -> raft::device_matrix<typename DatasetT::value_type, typename DatasetT::index_type>
+{
+  using value_type = typename DatasetT::value_type;
+  using index_type = typename DatasetT::index_type;
+  static_assert(std::is_same_v<typename DatasetT::layout_type, row_major>,
+                "Only row-major layout is supported at the moment");
+  RAFT_EXPECTS(n_samples <= dataset.extent(0),
+               "The number of samples must be smaller than the number of input rows in the current "
+               "implementation.");
+  size_t dim            = dataset.extent(1);
+  size_t trainset_ratio = dataset.extent(0) / n_samples;
+  auto result = raft::make_device_matrix<value_type, index_type>(res, n_samples, dataset.extent(1));
+
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(result.data_handle(),
+                                  sizeof(value_type) * dim,
+                                  dataset.data_handle(),
+                                  sizeof(value_type) * dim * trainset_ratio,
+                                  sizeof(value_type) * dim,
+                                  n_samples,
+                                  cudaMemcpyDefault,
+                                  raft::resource::get_cuda_stream(res)));
+  return result;
+}
+
+}  // namespace raft::util
+
+namespace raft::neighbors::detail {
+
+template <typename DatasetT>
+auto fill_missing_params_heuristics(const vpq_params& params, const DatasetT& dataset) -> vpq_params
+{
+  vpq_params r  = params;
+  double n_rows = dataset.extent(0);
+  size_t dim    = dataset.extent(1);
+  if (r.pq_dim == 0) { r.pq_dim = raft::div_rounding_up_safe(dim, size_t{4}); }
+  if (r.pq_bits == 0) { r.pq_bits = 8; }
+  if (r.vq_n_centers == 0) { r.vq_n_centers = raft::round_up_safe<uint32_t>(std::sqrt(n_rows), 8); }
+  if (r.vq_kmeans_trainset_fraction == 0) {
+    double vq_trainset_size       = 100.0 * r.vq_n_centers;
+    r.vq_kmeans_trainset_fraction = std::min(1.0, vq_trainset_size / n_rows);
+  }
+  if (r.pq_kmeans_trainset_fraction == 0) {
+    // NB: we'll have actually `pq_dim` times more samples than this
+    //     (because the dataset is reinterpreted as `[n_rows * pq_dim, pq_len]`)
+    double pq_trainset_size       = 1000.0 * (1u << r.pq_bits);
+    r.pq_kmeans_trainset_fraction = std::min(1.0, pq_trainset_size / n_rows);
+  }
+  return r;
+}
+
+template <typename T, typename DatasetT>
+auto transform_data(const raft::resources& res, DatasetT dataset)
+  -> device_mdarray<T, typename DatasetT::extents_type, typename DatasetT::layout_type>
+{
+  using index_type       = typename DatasetT::index_type;
+  using extents_type     = typename DatasetT::extents_type;
+  using layout_type      = typename DatasetT::layout_type;
+  using out_mdarray_type = device_mdarray<T, extents_type, layout_type>;
+  if constexpr (std::is_same_v<out_mdarray_type, std::decay<DatasetT>>) { return dataset; }
+
+  auto result = raft::make_device_mdarray<T, index_type, layout_type>(res, dataset.extents());
+
+  linalg::map(res,
+              result.view(),
+              spatial::knn::detail::utils::mapping<T>{},
+              raft::make_const_mdspan(dataset.view()));
+
+  return result;
+}
+
+/** Fix the internal indexing type to avoid integer underflows/overflows */
+using ix_t = int64_t;
+
+template <typename MathT, typename DatasetT>
+auto train_vq(const raft::resources& res, const vpq_params& params, const DatasetT& dataset)
+  -> device_matrix<MathT, uint32_t, row_major>
+{
+  const ix_t n_rows       = dataset.extent(0);
+  const ix_t vq_n_centers = params.vq_n_centers;
+  const ix_t dim          = dataset.extent(1);
+  const ix_t n_rows_train = n_rows * params.vq_kmeans_trainset_fraction;
+
+  // Subsample the dataset and transform into the required type if necessary
+  auto vq_trainset = raft::util::subsample(res, dataset, n_rows_train);
+  auto vq_centers  = raft::make_device_matrix<MathT, uint32_t, row_major>(res, vq_n_centers, dim);
+
+  using kmeans_in_type = typename DatasetT::value_type;
+  raft::cluster::kmeans_balanced_params kmeans_params;
+  kmeans_params.n_iters = params.kmeans_n_iters;
+  kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
+  auto vq_centers_view =
+    raft::make_device_matrix_view<MathT, ix_t>(vq_centers.data_handle(), vq_n_centers, dim);
+  auto vq_trainset_view = raft::make_device_matrix_view<const kmeans_in_type, ix_t>(
+    vq_trainset.data_handle(), n_rows_train, dim);
+  raft::cluster::kmeans_balanced::fit<kmeans_in_type, MathT, ix_t>(
+    res,
+    kmeans_params,
+    vq_trainset_view,
+    vq_centers_view,
+    spatial::knn::detail::utils::mapping<MathT>{});
+
+  return vq_centers;
+}
+
+template <typename LabelT, typename DatasetT, typename VqCentersT>
+auto predict_vq(const raft::resources& res, const DatasetT& dataset, const VqCentersT& vq_centers)
+  -> device_vector<LabelT, typename DatasetT::index_type>
+{
+  using kmeans_data_type = typename DatasetT::value_type;
+  using kmeans_math_type = typename VqCentersT::value_type;
+  using index_type       = typename DatasetT::index_type;
+  using label_type       = LabelT;
+
+  auto vq_labels = raft::make_device_vector<label_type, index_type>(res, dataset.extent(0));
+
+  raft::cluster::kmeans_balanced_params kmeans_params;
+  kmeans_params.metric = raft::distance::DistanceType::L2Expanded;
+
+  auto vq_centers_view = raft::make_device_matrix_view<const kmeans_math_type, index_type>(
+    vq_centers.data_handle(), vq_centers.extent(0), vq_centers.extent(1));
+
+  auto vq_dataset_view = raft::make_device_matrix_view<const kmeans_data_type, index_type>(
+    dataset.data_handle(), dataset.extent(0), dataset.extent(1));
+
+  raft::cluster::kmeans_balanced::
+    predict<kmeans_data_type, kmeans_math_type, index_type, label_type>(
+      res,
+      kmeans_params,
+      vq_dataset_view,
+      vq_centers_view,
+      vq_labels.view(),
+      spatial::knn::detail::utils::mapping<kmeans_math_type>{});
+
+  return vq_labels;
+}
+
+template <typename MathT, typename DatasetT>
+auto train_pq(const raft::resources& res,
+              const vpq_params& params,
+              const DatasetT& dataset,
+              const device_matrix_view<const MathT, uint32_t, row_major>& vq_centers)
+  -> device_matrix<MathT, uint32_t, row_major>
+{
+  const ix_t n_rows       = dataset.extent(0);
+  const ix_t dim          = dataset.extent(1);
+  const ix_t pq_dim       = params.pq_dim;
+  const ix_t pq_bits      = params.pq_bits;
+  const ix_t pq_n_centers = ix_t{1} << pq_bits;
+  const ix_t pq_len       = raft::div_rounding_up_safe(dim, pq_dim);
+  const ix_t n_rows_train = n_rows * params.pq_kmeans_trainset_fraction;
+
+  // Subsample the dataset and transform into the required type if necessary
+  auto pq_trainset = transform_data<MathT>(res, raft::util::subsample(res, dataset, n_rows_train));
+
+  // Subtract VQ centers
+  {
+    auto vq_labels   = predict_vq<uint32_t>(res, pq_trainset, vq_centers);
+    using index_type = typename DatasetT::index_type;
+    linalg::map_offset(
+      res,
+      pq_trainset.view(),
+      [labels = vq_labels.view(), centers = vq_centers, dim] __device__(index_type off, MathT x) {
+        index_type i = off / dim;
+        index_type j = off % dim;
+        return x - centers(labels(i), j);
+      },
+      raft::make_const_mdspan(pq_trainset.view()));
+  }
+
+  auto pq_centers = raft::make_device_matrix<MathT, uint32_t, row_major>(res, pq_n_centers, pq_len);
+
+  // Train PQ centers
+  {
+    raft::cluster::kmeans_balanced_params kmeans_params;
+    kmeans_params.n_iters = params.kmeans_n_iters;
+    kmeans_params.metric  = raft::distance::DistanceType::L2Expanded;
+
+    auto pq_centers_view =
+      raft::make_device_matrix_view<MathT, ix_t>(pq_centers.data_handle(), pq_n_centers, pq_len);
+
+    auto pq_trainset_view = raft::make_device_matrix_view<const MathT, ix_t>(
+      pq_trainset.data_handle(), n_rows_train * pq_dim, pq_len);
+
+    raft::cluster::kmeans_balanced::fit<MathT, MathT, ix_t>(
+      res, kmeans_params, pq_trainset_view, pq_centers_view);
+  }
+
+  return pq_centers;
+}
+
+template <uint32_t SubWarpSize, typename DataT, typename MathT, typename IdxT, typename LabelT>
+__device__ auto compute_code(device_matrix_view<const DataT, IdxT, row_major> dataset,
+                             device_matrix_view<const MathT, uint32_t, row_major> vq_centers,
+                             device_matrix_view<const MathT, uint32_t, row_major> pq_centers,
+                             IdxT i,
+                             uint32_t j,
+                             LabelT vq_label) -> uint8_t
+{
+  auto data_mapping = spatial::knn::detail::utils::mapping<MathT>{};
+  uint32_t lane_id  = Pow2<SubWarpSize>::mod(laneId());
+
+  const uint32_t pq_book_size = pq_centers.extent(0);
+  const uint32_t pq_len       = pq_centers.extent(1);
+  float min_dist              = std::numeric_limits<float>::infinity();
+  uint8_t code                = 0;
+  // calculate the distance for each PQ cluster, find the minimum for each thread
+  for (uint32_t l = lane_id; l < pq_book_size; l += SubWarpSize) {
+    // NB: the L2 quantifiers on residuals are always trained on L2 metric.
+    float d = 0.0f;
+    for (uint32_t k = 0; k < pq_len; k++) {
+      auto jk = j * pq_len + k;
+      auto x  = data_mapping(dataset(i, jk)) - vq_centers(vq_label, jk);
+      auto t  = x - pq_centers(l, k);
+      d += t * t;
+    }
+    if (d < min_dist) {
+      min_dist = d;
+      code     = uint8_t(l);
+    }
+  }
+  // reduce among threads
+#pragma unroll
+  for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
+    const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
+    const auto other_code = shfl_xor(code, stride, SubWarpSize);
+    if (other_dist < min_dist) {
+      min_dist = other_dist;
+      code     = other_code;
+    }
+  }
+  return code;
+}
+
+template <uint32_t BlockSize,
+          uint32_t PqBits,
+          typename DataT,
+          typename MathT,
+          typename IdxT,
+          typename LabelT>
+__launch_bounds__(BlockSize) RAFT_KERNEL
+  process_and_fill_codes_kernel(device_matrix_view<uint8_t, IdxT, row_major> out_codes,
+                                device_matrix_view<const DataT, IdxT, row_major> dataset,
+                                device_matrix_view<const MathT, uint32_t, row_major> vq_centers,
+                                device_vector_view<const LabelT, IdxT, row_major> vq_labels,
+                                device_matrix_view<const MathT, uint32_t, row_major> pq_centers)
+{
+  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
+  using subwarp_align             = Pow2<kSubWarpSize>;
+  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x});
+  if (row_ix >= out_codes.extent(0)) { return; }
+
+  const uint32_t pq_dim = raft::div_rounding_up_unsafe(vq_centers.extent(1), pq_centers.extent(1));
+
+  const uint32_t lane_id = Pow2<kSubWarpSize>::mod(threadIdx.x);
+  const LabelT vq_label  = vq_labels(row_ix);
+
+  // write label
+  auto* out_label_ptr = reinterpret_cast<LabelT*>(&out_codes(row_ix, 0));
+  if (lane_id == 0) { *out_label_ptr = vq_label; }
+
+  auto* out_codes_ptr = reinterpret_cast<uint8_t*>(out_label_ptr + 1);
+  ivf_pq::detail::bitfield_view_t<PqBits> code_view{out_codes_ptr};
+  for (uint32_t j = 0; j < pq_dim; j++) {
+    // find PQ label
+    uint8_t code = compute_code<kSubWarpSize>(dataset, vq_centers, pq_centers, row_ix, j, vq_label);
+    // TODO: this writes in global memory one byte per warp, which is very slow.
+    //  It's better to keep the codes in the shared memory or registers and dump them at once.
+    if (lane_id == 0) { code_view[j] = code; }
+  }
+}
+
+template <typename MathT, typename IdxT, typename DatasetT>
+auto process_and_fill_codes(const raft::resources& res,
+                            const vpq_params& params,
+                            const DatasetT& dataset,
+                            device_matrix_view<const MathT, uint32_t, row_major> vq_centers,
+                            device_matrix_view<const MathT, uint32_t, row_major> pq_centers)
+  -> device_matrix<uint8_t, IdxT, row_major>
+{
+  using data_t     = typename DatasetT::value_type;
+  using cdataset_t = vpq_dataset<MathT, IdxT>;
+  using label_t    = uint32_t;
+
+  const ix_t n_rows       = dataset.extent(0);
+  const ix_t dim          = dataset.extent(1);
+  const ix_t pq_dim       = params.pq_dim;
+  const ix_t pq_bits      = params.pq_bits;
+  const ix_t pq_n_centers = ix_t{1} << pq_bits;
+  // NB: codes must be aligned at least to sizeof(label_t) to be able to read labels.
+  const ix_t codes_rowlen =
+    sizeof(label_t) * (1 + raft::div_rounding_up_safe<ix_t>(pq_dim * pq_bits, 8 * sizeof(label_t)));
+
+  auto codes = raft::make_device_matrix<uint8_t, IdxT, row_major>(res, n_rows, codes_rowlen);
+
+  auto stream = raft::resource::get_cuda_stream(res);
+
+  // TODO: with scaling workspace we could choose the batch size dynamically
+  constexpr ix_t kReasonableMaxBatchSize = 65536;
+  constexpr ix_t kBlockSize              = 256;
+  const ix_t threads_per_vec             = std::min<ix_t>(WarpSize, pq_n_centers);
+  dim3 threads(kBlockSize, 1, 1);
+  ix_t max_batch_size = std::min<ix_t>(n_rows, kReasonableMaxBatchSize);
+  auto kernel         = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return process_and_fill_codes_kernel<kBlockSize, 4, data_t, MathT, IdxT, label_t>;
+      case 5: return process_and_fill_codes_kernel<kBlockSize, 5, data_t, MathT, IdxT, label_t>;
+      case 6: return process_and_fill_codes_kernel<kBlockSize, 6, data_t, MathT, IdxT, label_t>;
+      case 7: return process_and_fill_codes_kernel<kBlockSize, 7, data_t, MathT, IdxT, label_t>;
+      case 8: return process_and_fill_codes_kernel<kBlockSize, 8, data_t, MathT, IdxT, label_t>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(pq_bits);
+  for (const auto& batch :
+       spatial::knn::detail::utils::batch_load_iterator(dataset.data_handle(),
+                                                        n_rows,
+                                                        dim,
+                                                        max_batch_size,
+                                                        stream,
+                                                        rmm::mr::get_current_device_resource())) {
+    auto batch_view = raft::make_device_matrix_view(batch.data(), ix_t(batch.size()), dim);
+    auto labels     = predict_vq<label_t>(res, batch_view, vq_centers);
+    dim3 blocks(div_rounding_up_safe<ix_t>(n_rows, kBlockSize / threads_per_vec), 1, 1);
+    kernel<<<blocks, threads, 0, stream>>>(
+      make_device_matrix_view<uint8_t, IdxT>(
+        codes.data_handle() + batch.offset() * codes_rowlen, batch.size(), codes_rowlen),
+      batch_view,
+      vq_centers,
+      make_const_mdspan(labels.view()),
+      pq_centers);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+
+  return codes;
+}
+
+template <typename NewMathT, typename OldMathT, typename IdxT>
+auto vpq_convert_math_type(const raft::resources& res, vpq_dataset<OldMathT, IdxT>&& src)
+  -> vpq_dataset<NewMathT, IdxT>
+{
+  auto vq_code_book = make_device_mdarray<NewMathT>(res, src.vq_code_book.extents());
+  auto pq_code_book = make_device_mdarray<NewMathT>(res, src.pq_code_book.extents());
+
+  linalg::map(res,
+              vq_code_book.view(),
+              spatial::knn::detail::utils::mapping<NewMathT>{},
+              raft::make_const_mdspan(src.vq_code_book.view()));
+  linalg::map(res,
+              pq_code_book.view(),
+              spatial::knn::detail::utils::mapping<NewMathT>{},
+              raft::make_const_mdspan(src.pq_code_book.view()));
+  return vpq_dataset<NewMathT, IdxT>{
+    std::move(vq_code_book), std::move(pq_code_book), std::move(src.data)};
+}
+
+template <typename DatasetT, typename MathT, typename IdxT>
+auto vpq_build(const raft::resources& res, const vpq_params& params, const DatasetT& dataset)
+  -> vpq_dataset<MathT, IdxT>
+{
+  // Use a heuristic to impute missing parameters.
+  auto ps = fill_missing_params_heuristics(params, dataset);
+
+  // Train codes
+  auto vq_code_book = train_vq<MathT>(res, ps, dataset);
+  auto pq_code_book =
+    train_pq<MathT>(res, ps, dataset, raft::make_const_mdspan(vq_code_book.view()));
+
+  // Encode dataset
+  auto codes = process_and_fill_codes<MathT, IdxT>(res,
+                                                   ps,
+                                                   dataset,
+                                                   raft::make_const_mdspan(vq_code_book.view()),
+                                                   raft::make_const_mdspan(pq_code_book.view()));
+
+  return vpq_dataset<MathT, IdxT>{
+    std::move(vq_code_book), std::move(pq_code_book), std::move(codes)};
+}
+
+}  // namespace raft::neighbors::detail
diff --git a/cpp/include/raft/neighbors/vpq_dataset.cuh b/cpp/include/raft/neighbors/vpq_dataset.cuh
new file mode 100644
index 0000000000..73ee6c52ed
--- /dev/null
+++ b/cpp/include/raft/neighbors/vpq_dataset.cuh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "dataset.hpp"
+#include "detail/vpq_dataset.cuh"
+
+#include <raft/core/resources.hpp>
+
+namespace raft::neighbors {
+
+/**
+ * @brief Compress a dataset for use in CAGRA-Q search in place of the original data.
+ *
+ * @tparam DatasetT a row-major mdspan or mdarray (device or host).
+ * @tparam MathT a type of the codebook elements and internal math ops.
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] params VQ and PQ parameters for compressing the data
+ * @param[in] dataset a row-major mdspan or mdarray (device or host) [n_rows, dim].
+ */
+template <typename DatasetT,
+          typename MathT = typename DatasetT::value_type,
+          typename IdxT  = typename DatasetT::index_type>
+auto vpq_build(const raft::resources& res, const vpq_params& params, const DatasetT& dataset)
+  -> vpq_dataset<MathT, IdxT>
+{
+  if constexpr (std::is_same_v<MathT, half>) {
+    return detail::vpq_convert_math_type<half, float, IdxT>(
+      res, detail::vpq_build<DatasetT, float, IdxT>(res, params, dataset));
+  } else {
+    return detail::vpq_build<DatasetT, MathT, IdxT>(res, params, dataset);
+  }
+}
+
+}  // namespace raft::neighbors