diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 1a563d213e..36ceccc36f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -18,6 +18,7 @@
 
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
+#include <raft/neighbors/detail/ivf_pq_codepacking.cuh>
 #include <raft/neighbors/ivf_list.hpp>
 #include <raft/neighbors/ivf_pq_types.hpp>
 
@@ -60,63 +61,6 @@ namespace raft::neighbors::ivf_pq::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
 
-/** A chunk of PQ-encoded vector managed by one CUDA thread. */
-using pq_vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
-
-namespace {
-
-/**
- * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`.
- *
- * @tparam Bits number of bits comprising the value.
- */
-template <uint32_t Bits>
-struct bitfield_ref_t {
-  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
-  constexpr static uint8_t kMask = static_cast<uint8_t>((1u << Bits) - 1u);
-  uint8_t* ptr;
-  uint32_t offset;
-
-  constexpr operator uint8_t()  // NOLINT
-  {
-    auto pair = static_cast<uint16_t>(ptr[0]);
-    if (offset + Bits > 8) { pair |= static_cast<uint16_t>(ptr[1]) << 8; }
-    return static_cast<uint8_t>((pair >> offset) & kMask);
-  }
-
-  constexpr auto operator=(uint8_t code) -> bitfield_ref_t&
-  {
-    if (offset + Bits > 8) {
-      auto pair = static_cast<uint16_t>(ptr[0]);
-      pair |= static_cast<uint16_t>(ptr[1]) << 8;
-      pair &= ~(static_cast<uint16_t>(kMask) << offset);
-      pair |= static_cast<uint16_t>(code) << offset;
-      ptr[0] = static_cast<uint8_t>(Pow2<256>::mod(pair));
-      ptr[1] = static_cast<uint8_t>(Pow2<256>::div(pair));
-    } else {
-      ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset);
-    }
-    return *this;
-  }
-};
-
-/**
- * View a byte array as an array of unsigned integers of custom small bit size.
- *
- * @tparam Bits number of bits comprising a single element of the array.
- */
-template <uint32_t Bits>
-struct bitfield_view_t {
-  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
-  uint8_t* raw;
-
-  constexpr auto operator[](uint32_t i) -> bitfield_ref_t<Bits>
-  {
-    uint32_t bit_offset = i * Bits;
-    return bitfield_ref_t<Bits>{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)};
-  }
-};
-
 template <uint32_t BlockDim, typename T, typename S>
 __launch_bounds__(BlockDim) __global__ void copy_warped_kernel(
   T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
@@ -162,8 +106,6 @@ void copy_warped(T* out,
     <<<blocks, threads, 0, stream>>>(out, ld_out, in, ld_in, n_cols, n_rows);
 }
 
-}  // namespace
-
 /**
  * @brief Fill-in a random orthogonal transformation matrix.
  *
@@ -276,7 +218,7 @@ void flat_compute_residuals(
   device_matrix_view<const float, uint32_t, row_major> rotation_matrix,  // [rot_dim, dim]
   device_matrix_view<const float, uint32_t, row_major> centers,          // [n_lists, dim_ext]
   const T* dataset,                                                      // [n_rows, dim]
-  const uint32_t* labels,                                                // [n_rows]
+  std::variant<uint32_t, const uint32_t*> labels,                        // [n_rows]
   rmm::mr::device_memory_resource* device_memory)
 {
   auto stream  = handle.get_stream();
@@ -287,7 +229,9 @@ void flat_compute_residuals(
   linalg::map_offset(handle, tmp_view, [centers, dataset, labels, dim] __device__(size_t i) {
     auto row_ix = i / dim;
     auto el_ix  = i % dim;
-    auto label  = labels[row_ix];
+    auto label  = std::holds_alternative<uint32_t>(labels)
+                    ? std::get<uint32_t>(labels)
+                    : std::get<const uint32_t*>(labels)[row_ix];
     return utils::mapping<float>{}(dataset[i]) - centers(label, el_ix);
   });
 
@@ -558,11 +502,363 @@ void train_per_cluster(raft::device_resources const& handle,
 }
 
 /**
- * Compute the code: find the closest cluster in each pq_dim-subspace.
+ * A helper function: given the dataset in the rotated space
+ *  [n_rows, rot_dim] = [n_rows, pq_dim * pq_len],
+ * reinterpret the last dimension as two: [n_rows, pq_dim, pq_len]
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param vectors input data [n_rows, rot_dim]
+ * @param pq_centers codebook (used to infer the structure - pq_len)
+ * @return reinterpreted vectors [n_rows, pq_dim, pq_len]
+ */
+template <typename T, typename IdxT>
+static __device__ auto reinterpret_vectors(
+  device_matrix_view<T, IdxT, row_major> vectors,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers)
+  -> device_mdspan<T, extent_3d<IdxT>, row_major>
+{
+  const uint32_t pq_len = pq_centers.extent(1);
+  const uint32_t pq_dim = vectors.extent(1) / pq_len;
+  using layout_t        = typename decltype(vectors)::layout_type;
+  using accessor_t      = typename decltype(vectors)::accessor_type;
+  return mdspan<T, extent_3d<IdxT>, layout_t, accessor_t>(
+    vectors.data_handle(), extent_3d<IdxT>{vectors.extent(0), pq_dim, pq_len});
+}
+
+/**
+ * A consumer for the `run_on_list` and `run_on_vector` that just flattens PQ codes
+ * one-per-byte. That is, independent of the code width (pq_bits), one code uses
+ * the whole byte, hence one vectors uses pq_dim bytes.
+ */
+struct unpack_codes {
+  device_matrix_view<uint8_t, uint32_t, row_major> out_codes;
+
+  /**
+   * Create a callable to be passed to `run_on_list`.
+   *
+   * @param[out] out_codes the destination for the read codes.
+   */
+  __device__ inline unpack_codes(device_matrix_view<uint8_t, uint32_t, row_major> out_codes)
+    : out_codes{out_codes}
+  {
+  }
+
+  /**  Write j-th component (code) of the i-th vector into the output array. */
+  __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j)
+  {
+    out_codes(i, j) = code;
+  }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel(
+  device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  const uint32_t pq_dim = out_codes.extent(1);
+  auto unpack_action    = unpack_codes{out_codes};
+  run_on_list<PqBits>(in_list_data, offset_or_indices, out_codes.extent(0), pq_dim, unpack_action);
+}
+
+/**
+ * Unpack flat PQ codes from an existing list by the given offset.
+ *
+ * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] list_data the packed ivf::list data.
+ * @param[in] offset_or_indices how many records in the list to skip or the exact indices.
+ * @param[in] pq_bits codebook size (1 << pq_bits)
+ * @param[in] stream
+ */
+inline void unpack_list_data(
+  device_matrix_view<uint8_t, uint32_t, row_major> codes,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t pq_bits,
+  rmm::cuda_stream_view stream)
+{
+  auto n_rows = codes.extent(0);
+  if (n_rows == 0) { return; }
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [pq_bits]() {
+    switch (pq_bits) {
+      case 4: return unpack_list_data_kernel<kBlockSize, 4>;
+      case 5: return unpack_list_data_kernel<kBlockSize, 5>;
+      case 6: return unpack_list_data_kernel<kBlockSize, 6>;
+      case 7: return unpack_list_data_kernel<kBlockSize, 7>;
+      case 8: return unpack_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }();
+  kernel<<<blocks, threads, 0, stream>>>(codes, list_data, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Unpack the list data; see the public interface for the api and usage. */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  unpack_list_data(out_codes,
+                   index.lists()[label]->data.view(),
+                   offset_or_indices,
+                   index.pq_bits(),
+                   res.get_stream());
+}
+
+/** A consumer for the `run_on_list` and `run_on_vector` that approximates the original input data.
+ */
+struct reconstruct_vectors {
+  codebook_gen codebook_kind;
+  uint32_t cluster_ix;
+  uint32_t pq_len;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> centers_rot;
+  device_mdspan<float, extent_3d<uint32_t>, row_major> out_vectors;
+
+  /**
+   * Create a callable to be passed to `run_on_list`.
+   *
+   * @param[out] out_vectors the destination for the decoded vectors.
+   * @param[in] pq_centers the codebook
+   * @param[in] centers_rot
+   * @param[in] codebook_kind
+   * @param[in] cluster_ix label/id of the cluster.
+   */
+  __device__ inline reconstruct_vectors(
+    device_matrix_view<float, uint32_t, row_major> out_vectors,
+    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+    device_matrix_view<const float, uint32_t, row_major> centers_rot,
+    codebook_gen codebook_kind,
+    uint32_t cluster_ix)
+    : codebook_kind{codebook_kind},
+      cluster_ix{cluster_ix},
+      pq_len{pq_centers.extent(1)},
+      pq_centers{pq_centers},
+      centers_rot{reinterpret_vectors(centers_rot, pq_centers)},
+      out_vectors{reinterpret_vectors(out_vectors, pq_centers)}
+  {
+  }
+
+  /**
+   * Decode j-th component of the i-th vector by its code and write it into a chunk of the output
+   * vectors (pq_len elements).
+   */
+  __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j)
+  {
+    uint32_t partition_ix;
+    switch (codebook_kind) {
+      case codebook_gen::PER_CLUSTER: {
+        partition_ix = cluster_ix;
+      } break;
+      case codebook_gen::PER_SUBSPACE: {
+        partition_ix = j;
+      } break;
+      default: __builtin_unreachable();
+    }
+    for (uint32_t k = 0; k < pq_len; k++) {
+      out_vectors(i, j, k) = pq_centers(partition_ix, k, code) + centers_rot(cluster_ix, j, k);
+    }
+  }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
+  device_matrix_view<float, uint32_t, row_major> out_vectors,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  device_matrix_view<const float, uint32_t, row_major> centers_rot,
+  codebook_gen codebook_kind,
+  uint32_t cluster_ix,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  const uint32_t pq_dim = out_vectors.extent(1) / pq_centers.extent(1);
+  auto reconstruct_action =
+    reconstruct_vectors{out_vectors, pq_centers, centers_rot, codebook_kind, cluster_ix};
+  run_on_list<PqBits>(
+    in_list_data, offset_or_indices, out_vectors.extent(0), pq_dim, reconstruct_action);
+}
+
+/** Decode the list data; see the public interface for the api and usage. */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label,
+                           std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  auto n_rows = out_vectors.extent(0);
+  if (n_rows == 0) { return; }
+  auto& list = index.lists()[label];
+  if (std::holds_alternative<uint32_t>(offset_or_indices)) {
+    auto n_skip = std::get<uint32_t>(offset_or_indices);
+    // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
+    // to avoid an extra device-host data copy and the stream sync.
+    RAFT_EXPECTS(n_skip + n_rows <= list->size.load(),
+                 "offset + output size must be not bigger than the cluster size.");
+  }
+
+  auto tmp = make_device_mdarray<float>(
+    res, res.get_workspace_resource(), make_extents<uint32_t>(n_rows, index.rot_dim()));
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return reconstruct_list_data_kernel<kBlockSize, 4>;
+      case 5: return reconstruct_list_data_kernel<kBlockSize, 5>;
+      case 6: return reconstruct_list_data_kernel<kBlockSize, 6>;
+      case 7: return reconstruct_list_data_kernel<kBlockSize, 7>;
+      case 8: return reconstruct_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(index.pq_bits());
+  kernel<<<blocks, threads, 0, res.get_stream()>>>(tmp.view(),
+                                                   list->data.view(),
+                                                   index.pq_centers(),
+                                                   index.centers_rot(),
+                                                   index.codebook_kind(),
+                                                   label,
+                                                   offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  float* out_float_ptr = nullptr;
+  rmm::device_uvector<float> out_float_buf(0, res.get_stream(), res.get_workspace_resource());
+  if constexpr (std::is_same_v<T, float>) {
+    out_float_ptr = out_vectors.data_handle();
+  } else {
+    out_float_buf.resize(size_t{n_rows} * size_t{index.dim()}, res.get_stream());
+    out_float_ptr = out_float_buf.data();
+  }
+  // Rotate the results back to the original space
+  float alpha = 1.0;
+  float beta  = 0.0;
+  linalg::gemm(res,
+               false,
+               false,
+               index.dim(),
+               n_rows,
+               index.rot_dim(),
+               &alpha,
+               index.rotation_matrix().data_handle(),
+               index.dim(),
+               tmp.data_handle(),
+               index.rot_dim(),
+               &beta,
+               out_float_ptr,
+               index.dim(),
+               res.get_stream());
+  // Transform the data to the original type, if necessary
+  if constexpr (!std::is_same_v<T, float>) {
+    linalg::map(res,
+                out_vectors,
+                utils::mapping<T>{},
+                make_device_matrix_view<const float>(out_float_ptr, n_rows, index.dim()));
+  }
+}
+
+/**
+ * A producer for the `write_list` and `write_vector` reads the codes byte-by-byte. That is,
+ * independent of the code width (pq_bits), one code uses the whole byte, hence one vectors uses
+ * pq_dim bytes.
+ */
+struct pass_codes {
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes;
+
+  /**
+   * Create a callable to be passed to `run_on_list`.
+   *
+   * @param[in] codes the source codes.
+   */
+  __device__ inline pass_codes(device_matrix_view<const uint8_t, uint32_t, row_major> codes)
+    : codes{codes}
+  {
+  }
+
+  /** Read j-th component (code) of the i-th vector from the source. */
+  __device__ inline auto operator()(uint32_t i, uint32_t j) const -> uint8_t { return codes(i, j); }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void pack_list_data_kernel(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  write_list<PqBits, 1>(
+    list_data, offset_or_indices, codes.extent(0), codes.extent(1), pass_codes{codes});
+}
+
+/**
+ * Write flat PQ codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_rows).
+ *
+ * @param[out] list_data the packed ivf::list data.
+ * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] offset_or_indices how many records in the list to skip or the exact indices.
+ * @param[in] pq_bits codebook size (1 << pq_bits)
+ * @param[in] stream
+ */
+inline void pack_list_data(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t pq_bits,
+  rmm::cuda_stream_view stream)
+{
+  auto n_rows = codes.extent(0);
+  if (n_rows == 0) { return; }
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [pq_bits]() {
+    switch (pq_bits) {
+      case 4: return pack_list_data_kernel<kBlockSize, 4>;
+      case 5: return pack_list_data_kernel<kBlockSize, 5>;
+      case 6: return pack_list_data_kernel<kBlockSize, 6>;
+      case 7: return pack_list_data_kernel<kBlockSize, 7>;
+      case 8: return pack_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }();
+  kernel<<<blocks, threads, 0, stream>>>(list_data, codes, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+template <typename IdxT>
+void pack_list_data(raft::device_resources const& res,
+                    index<IdxT>* index,
+                    device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                    uint32_t label,
+                    std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  pack_list_data(index->lists()[label]->data.view(),
+                 new_codes,
+                 offset_or_indices,
+                 index->pq_bits(),
+                 res.get_stream());
+}
+
+/**
+ *
+ * A producer for the `write_list` and `write_vector` that encodes level-1 input vector residuals
+ * into lvl-2 PQ codes.
+ * Computing a PQ code means finding the closest cluster in a pq_dim-subspace.
  *
  * @tparam SubWarpSize
  *   how many threads work on a single vector;
- *   bouded by either WarpSize or pq_book_size.
+ *   bounded by either WarpSize or pq_book_size.
  *
  * @param pq_centers
  *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
@@ -574,56 +870,75 @@ void train_per_cluster(raft::device_resources const& handle,
  * @param j index along pq_dim "dimension"
  * @param cluster_ix is used for PER_CLUSTER codebooks.
  */
-template <uint32_t SubWarpSize>
-__device__ auto compute_pq_code(
-  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
-  device_mdspan<const float, extent_2d<uint32_t>, row_major> new_vector,
-  codebook_gen codebook_kind,
-  uint32_t j,
-  uint32_t cluster_ix) -> uint8_t
-{
-  using subwarp_align = Pow2<SubWarpSize>;
-  uint32_t lane_id    = subwarp_align::mod(laneId());
-  uint32_t partition_ix;
-  switch (codebook_kind) {
-    case codebook_gen::PER_CLUSTER: {
-      partition_ix = cluster_ix;
-    } break;
-    case codebook_gen::PER_SUBSPACE: {
-      partition_ix = j;
-    } break;
-    default: __builtin_unreachable();
+/**
+ */
+template <uint32_t SubWarpSize, typename IdxT>
+struct encode_vectors {
+  codebook_gen codebook_kind;
+  uint32_t cluster_ix;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers;
+  device_mdspan<const float, extent_3d<IdxT>, row_major> in_vectors;
+
+  __device__ inline encode_vectors(
+    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+    device_matrix_view<const float, IdxT, row_major> in_vectors,
+    codebook_gen codebook_kind,
+    uint32_t cluster_ix)
+    : codebook_kind{codebook_kind},
+      cluster_ix{cluster_ix},
+      pq_centers{pq_centers},
+      in_vectors{reinterpret_vectors(in_vectors, pq_centers)}
+  {
   }
 
-  const uint32_t pq_book_size = pq_centers.extent(2);
-  const uint32_t pq_len       = pq_centers.extent(1);
-  float min_dist              = std::numeric_limits<float>::infinity();
-  uint8_t code                = 0;
-  // calculate the distance for each PQ cluster, find the minimum for each thread
-  for (uint32_t i = lane_id; i < pq_book_size; i += subwarp_align::Value) {
-    // NB: the L2 quantifiers on residuals are always trained on L2 metric.
-    float d = 0.0f;
-    for (uint32_t k = 0; k < pq_len; k++) {
-      auto t = new_vector(j, k) - pq_centers(partition_ix, k, i);
-      d += t * t;
+  /**
+   * Decode j-th component of the i-th vector by its code and write it into a chunk of the output
+   * vectors (pq_len elements).
+   */
+  __device__ inline auto operator()(IdxT i, uint32_t j) -> uint8_t
+  {
+    uint32_t lane_id = Pow2<SubWarpSize>::mod(laneId());
+    uint32_t partition_ix;
+    switch (codebook_kind) {
+      case codebook_gen::PER_CLUSTER: {
+        partition_ix = cluster_ix;
+      } break;
+      case codebook_gen::PER_SUBSPACE: {
+        partition_ix = j;
+      } break;
+      default: __builtin_unreachable();
     }
-    if (d < min_dist) {
-      min_dist = d;
-      code     = uint8_t(i);
+
+    const uint32_t pq_book_size = pq_centers.extent(2);
+    const uint32_t pq_len       = pq_centers.extent(1);
+    float min_dist              = std::numeric_limits<float>::infinity();
+    uint8_t code                = 0;
+    // calculate the distance for each PQ cluster, find the minimum for each thread
+    for (uint32_t l = lane_id; l < pq_book_size; l += SubWarpSize) {
+      // NB: the L2 quantifiers on residuals are always trained on L2 metric.
+      float d = 0.0f;
+      for (uint32_t k = 0; k < pq_len; k++) {
+        auto t = in_vectors(i, j, k) - pq_centers(partition_ix, k, l);
+        d += t * t;
+      }
+      if (d < min_dist) {
+        min_dist = d;
+        code     = uint8_t(l);
+      }
     }
-  }
-  // reduce among threads
+    // reduce among threads
 #pragma unroll
-  for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
-    const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
-    const auto other_code = shfl_xor(code, stride, SubWarpSize);
-    if (other_dist < min_dist) {
-      min_dist = other_dist;
-      code     = other_code;
+    for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
+      const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
+      const auto other_code = shfl_xor(code, stride, SubWarpSize);
+      if (other_dist < min_dist) {
+        min_dist = other_dist;
+        code     = other_code;
+      }
     }
+    return code;
   }
-  return code;
-}
+};
 
 template <uint32_t BlockSize, uint32_t PqBits, typename IdxT>
 __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
@@ -639,7 +954,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
   using subwarp_align             = Pow2<kSubWarpSize>;
   const uint32_t lane_id          = subwarp_align::mod(threadIdx.x);
-  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{blockDim.x} * IdxT{blockIdx.x});
+  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x});
   if (row_ix >= new_vectors.extent(0)) { return; }
 
   const uint32_t cluster_ix = new_labels[row_ix];
@@ -647,7 +962,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   if (lane_id == 0) { out_ix = atomicAdd(&list_sizes(cluster_ix), 1); }
   out_ix = shfl(out_ix, 0, kSubWarpSize);
 
-  // write the label
+  // write the label  (one record per subwarp)
   auto pq_indices = inds_ptrs(cluster_ix);
   if (lane_id == 0) {
     if (std::holds_alternative<IdxT>(src_offset_or_indices)) {
@@ -657,40 +972,81 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
     }
   }
 
-  // write the codes
-  using group_align         = Pow2<kIndexGroupSize>;
-  const uint32_t group_ix   = group_align::div(out_ix);
-  const uint32_t ingroup_ix = group_align::mod(out_ix);
-  const uint32_t pq_len     = pq_centers.extent(1);
-  const uint32_t pq_dim     = new_vectors.extent(1) / pq_len;
-
+  // write the codes (one record per subwarp):
+  const uint32_t pq_dim = new_vectors.extent(1) / pq_centers.extent(1);
   auto pq_extents = list_spec<uint32_t, IdxT>{PqBits, pq_dim, true}.make_list_extents(out_ix + 1);
-  auto pq_extents_vectorized =
-    make_extents<uint32_t>(pq_extents.extent(0), pq_extents.extent(1), pq_extents.extent(2));
-  auto pq_dataset = make_mdspan<pq_vec_t, uint32_t, row_major, false, true>(
-    reinterpret_cast<pq_vec_t*>(data_ptrs[cluster_ix]), pq_extents_vectorized);
-
-  __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)];
-  pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)];
-  bitfield_view_t<PqBits> out{reinterpret_cast<uint8_t*>(&code)};
-  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
-  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
-    // clear the chunk for writing
-    if (lane_id == 0) { code = pq_vec_t{}; }
-    // fill-in the values, one/pq_dim at a time
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      // find the label
-      using layout_t   = typename decltype(new_vectors)::layout_type;
-      using accessor_t = typename decltype(new_vectors)::accessor_type;
-      auto one_vector  = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
-        &new_vectors(row_ix, 0), extent_2d<uint32_t>{pq_dim, pq_len});
-      auto l = compute_pq_code<kSubWarpSize>(pq_centers, one_vector, codebook_kind, j, cluster_ix);
-      if (lane_id == 0) { out[k] = l; }
+  auto pq_dataset =
+    make_mdspan<uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+  write_vector<PqBits, kSubWarpSize>(
+    pq_dataset,
+    out_ix,
+    row_ix,
+    pq_dim,
+    encode_vectors<kSubWarpSize, IdxT>{pq_centers, new_vectors, codebook_kind, cluster_ix});
+}
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void encode_list_data_kernel(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const float, uint32_t, row_major> new_vectors,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  codebook_gen codebook_kind,
+  uint32_t cluster_ix,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
+  const uint32_t pq_dim           = new_vectors.extent(1) / pq_centers.extent(1);
+  auto encode_action =
+    encode_vectors<kSubWarpSize, uint32_t>{pq_centers, new_vectors, codebook_kind, cluster_ix};
+  write_list<PqBits, kSubWarpSize>(
+    list_data, offset_or_indices, new_vectors.extent(0), pq_dim, encode_action);
+}
+
+template <typename T, typename IdxT>
+void encode_list_data(raft::device_resources const& res,
+                      index<IdxT>* index,
+                      device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                      uint32_t label,
+                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  auto n_rows = new_vectors.extent(0);
+  if (n_rows == 0) { return; }
+
+  auto mr = res.get_workspace_resource();
+
+  auto new_vectors_residual =
+    make_device_mdarray<float>(res, mr, make_extents<uint32_t>(n_rows, index->rot_dim()));
+
+  flat_compute_residuals<T, uint32_t>(res,
+                                      new_vectors_residual.data_handle(),
+                                      n_rows,
+                                      index->rotation_matrix(),
+                                      index->centers(),
+                                      new_vectors.data_handle(),
+                                      label,
+                                      mr);
+
+  constexpr uint32_t kBlockSize  = 256;
+  const uint32_t threads_per_vec = std::min<uint32_t>(WarpSize, index->pq_book_size());
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize / threads_per_vec), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return encode_list_data_kernel<kBlockSize, 4>;
+      case 5: return encode_list_data_kernel<kBlockSize, 5>;
+      case 6: return encode_list_data_kernel<kBlockSize, 6>;
+      case 7: return encode_list_data_kernel<kBlockSize, 7>;
+      case 8: return encode_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
     }
-    // write the chunk into the dataset
-    if (lane_id == 0) { pq_dataset(group_ix, i, ingroup_ix) = code; }
-  }
+  }(index->pq_bits());
+  kernel<<<blocks, threads, 0, res.get_stream()>>>(index->lists()[label]->data.view(),
+                                                   new_vectors_residual.view(),
+                                                   index->pq_centers(),
+                                                   index->codebook_kind(),
+                                                   label,
+                                                   offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 /**
@@ -732,14 +1088,14 @@ void process_and_fill_codes(raft::device_resources const& handle,
   auto new_vectors_residual =
     make_device_mdarray<float>(handle, mr, make_extents<IdxT>(n_rows, index.rot_dim()));
 
-  flat_compute_residuals(handle,
-                         new_vectors_residual.data_handle(),
-                         n_rows,
-                         index.rotation_matrix(),
-                         index.centers(),
-                         new_vectors,
-                         new_labels,
-                         mr);
+  flat_compute_residuals<T, IdxT>(handle,
+                                  new_vectors_residual.data_handle(),
+                                  n_rows,
+                                  index.rotation_matrix(),
+                                  index.centers(),
+                                  new_vectors,
+                                  new_labels,
+                                  mr);
 
   constexpr uint32_t kBlockSize  = 256;
   const uint32_t threads_per_vec = std::min<uint32_t>(WarpSize, index.pq_book_size());
@@ -819,6 +1175,85 @@ void recompute_internal_state(const raft::device_resources& res, index<IdxT>& in
   }
 }
 
+/**
+ * Helper function: allocate enough space in the list, compute the offset, at which to start
+ * writing, and fill-in indices.
+ *
+ * @return offset for writing the data
+ */
+template <typename IdxT>
+auto extend_list_prepare(raft::device_resources const& res,
+                         index<IdxT>* index,
+                         device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                         uint32_t label) -> uint32_t
+{
+  uint32_t n_rows = new_indices.extent(0);
+  uint32_t offset;
+  // Allocate the lists to fit the new data
+  copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream());
+  res.sync_stream();
+  uint32_t new_size = offset + n_rows;
+  copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream());
+  auto spec = list_spec<uint32_t, IdxT>{
+    index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()};
+  auto& list = index->lists()[label];
+  ivf::resize_list(res, list, spec, new_size, offset);
+  copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream());
+  return offset;
+}
+
+/**
+ * Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps.
+ * See the public interface for the api and usage.
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  // Allocate memory and write indices
+  auto offset = extend_list_prepare(res, index, new_indices, label);
+  // Pack the data
+  pack_list_data<IdxT>(res, index, new_codes, label, offset);
+  // Update the pointers and the sizes
+  recompute_internal_state(res, *index);
+}
+
+/**
+ * Extend one list of the index in-place, by the list label, skipping the classification step.
+ * See the public interface for the api and usage.
+ */
+template <typename T, typename IdxT>
+void extend_list(raft::device_resources const& res,
+                 index<IdxT>* index,
+                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                 uint32_t label)
+{
+  // Allocate memory and write indices
+  auto offset = extend_list_prepare(res, index, new_indices, label);
+  // Encode the data
+  encode_list_data<T, IdxT>(res, index, new_vectors, label, offset);
+  // Update the pointers and the sizes
+  recompute_internal_state(res, *index);
+}
+
+/**
+ * Remove all data from a single list.
+ * See the public interface for the api and usage.
+ */
+template <typename IdxT>
+void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
+{
+  uint32_t zero = 0;
+  copy(index->list_sizes().data_handle() + label, &zero, 1, res.get_stream());
+  index->lists()[label].reset();
+  recompute_internal_state(res, *index);
+}
+
 /** Copy the state of an index into a new index, but share the list data among the two. */
 template <typename IdxT>
 auto clone(const raft::device_resources& res, const index<IdxT>& source) -> index<IdxT>
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh
new file mode 100644
index 0000000000..52969dd176
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/ivf_list.hpp>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+#include <raft/util/vectorized.cuh>
+
+#include <variant>
+
+namespace raft::neighbors::ivf_pq::detail {
+
+/** A chunk of PQ-encoded vector managed by one CUDA thread. */
+using pq_vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
+
+/**
+ * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`.
+ *
+ * @tparam Bits number of bits comprising the value.
+ */
+template <uint32_t Bits>
+struct bitfield_ref_t {
+  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
+  constexpr static uint8_t kMask = static_cast<uint8_t>((1u << Bits) - 1u);
+  uint8_t* ptr;
+  uint32_t offset;
+
+  constexpr operator uint8_t()  // NOLINT
+  {
+    auto pair = static_cast<uint16_t>(ptr[0]);
+    if (offset + Bits > 8) { pair |= static_cast<uint16_t>(ptr[1]) << 8; }
+    return static_cast<uint8_t>((pair >> offset) & kMask);
+  }
+
+  constexpr auto operator=(uint8_t code) -> bitfield_ref_t&
+  {
+    if (offset + Bits > 8) {
+      auto pair = static_cast<uint16_t>(ptr[0]);
+      pair |= static_cast<uint16_t>(ptr[1]) << 8;
+      pair &= ~(static_cast<uint16_t>(kMask) << offset);
+      pair |= static_cast<uint16_t>(code) << offset;
+      ptr[0] = static_cast<uint8_t>(Pow2<256>::mod(pair));
+      ptr[1] = static_cast<uint8_t>(Pow2<256>::div(pair));
+    } else {
+      ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset);
+    }
+    return *this;
+  }
+};
+
+/**
+ * View a byte array as an array of unsigned integers of custom small bit size.
+ *
+ * @tparam Bits number of bits comprising a single element of the array.
+ */
+template <uint32_t Bits>
+struct bitfield_view_t {
+  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
+  uint8_t* raw;
+
+  constexpr auto operator[](uint32_t i) -> bitfield_ref_t<Bits>
+  {
+    uint32_t bit_offset = i * Bits;
+    return bitfield_ref_t<Bits>{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)};
+  }
+};
+
+/**
+ * Process a single vector in a list.
+ *
+ * @tparam PqBits
+ * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack)
+ *
+ * @param[in] in_list_data the encoded cluster data.
+ * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
+ * @param[in] out_ix the output index passed to the action
+ * @param[in] pq_dim
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_vector(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  uint32_t in_ix,
+  uint32_t out_ix,
+  uint32_t pq_dim,
+  Action action)
+{
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(in_ix);
+  const uint32_t ingroup_ix = group_align::mod(in_ix);
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // read the chunk
+    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
+    // read the codes, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // read a piece of the reconstructed vector
+      action(code_view[k], out_ix, j);
+    }
+  }
+}
+
+/**
+ * Process a single vector in a list.
+ *
+ * @tparam PqBits
+ * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data).
+ * @tparam IdxT type of the index passed to the action
+ * @tparam Action tells how to process a single vector (e.g. encode or just pack)
+ *
+ * @param[in] out_list_data the encoded cluster data.
+ * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads).
+ * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads).
+ * @param[in] pq_dim
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, uint32_t SubWarpSize, typename IdxT, typename Action>
+__device__ void write_vector(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  uint32_t out_ix,
+  IdxT in_ix,
+  uint32_t pq_dim,
+  Action action)
+{
+  const uint32_t lane_id = Pow2<SubWarpSize>::mod(threadIdx.x);
+
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(out_ix);
+  const uint32_t ingroup_ix = group_align::mod(out_ix);
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // clear the chunk
+    if (lane_id == 0) { code_chunk = pq_vec_t{}; }
+    // write the codes, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // write a single code
+      uint8_t code = action(in_ix, j);
+      if (lane_id == 0) { code_view[k] = code; }
+    }
+    // write the chunk to the list
+    if (lane_id == 0) {
+      *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk;
+    }
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_list(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
+  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
+    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    run_on_vector<PqBits>(in_list_data, src_ix, ix, pq_dim, action);
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
+__device__ void write_list(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
+  using subwarp_align = Pow2<SubWarpSize>;
+  uint32_t stride     = subwarp_align::div(blockDim.x);
+  uint32_t ix         = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x);
+  for (; ix < len; ix += stride) {
+    const uint32_t dst_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    write_vector<PqBits, SubWarpSize>(out_list_data, dst_ix, ix, pq_dim, action);
+  }
+}
+
+}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 934643e0af..dfc24e8214 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -234,7 +234,7 @@ auto build(raft::device_resources const& handle,
  * @brief Build a new index containing the data of the original plus new extra vectors.
  *
  * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    The new data is clustered according to existing kmeans clusters, the cluster
  *    centers are unchanged.
  *
  * Usage example:
diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
new file mode 100644
index 0000000000..398bd545f1
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/detail/ivf_pq_build.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+
+namespace raft::neighbors::ivf_pq::helpers {
+/**
+ * @defgroup ivf_pq_helpers Helper functions for manipulationg IVF PQ Index
+ * @{
+ */
+
+namespace codepacker {
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`.
+ *
+ * Bit compression is removed, which means output will have pq_dim dimensional vectors (one code per
+ * byte, instead of ceildiv(pq_dim * pq_bits, 8) bytes of pq codes).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data = index.lists()[label]->data.view();
+ *   // allocate the buffer for the output
+ *   uint32_t n_take = 4;
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_take, index.pq_dim());
+ *   uint32_t offset = 0;
+ *   // unpack n_take elements from the list
+ *   ivf_pq::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view());
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] list_data block to read from
+ * @param[in] pq_bits bit length of encoded vector elements
+ * @param[in] offset
+ *   How many records in the list to skip.
+ * @param[out] codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ */
+inline void unpack(
+  raft::device_resources const& res,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  uint32_t pq_bits,
+  uint32_t offset,
+  device_matrix_view<uint8_t, uint32_t, row_major> codes)
+{
+  ivf_pq::detail::unpack_list_data(codes, list_data, offset, pq_bits, res.get_stream());
+}
+
+/**
+ * Write flat PQ codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data  = index.lists()[label]->data.view();
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_vec, index.pq_dim());
+ *   ... prepare n_vecs to pack into the list in codes ...
+ *   // write codes into the list starting from the 42nd position
+ *   ivf_pq::helpers::codepacker::pack(
+ *       res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data);
+ * @endcode
+ *
+ * @param[in] res
+ * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim]
+ * @param[in] pq_bits bit length of encoded vector elements
+ * @param[in] offset how many records to skip before writing the data into the list
+ * @param[in] list_data block to write into
+ */
+inline void pack(
+  raft::device_resources const& res,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  uint32_t pq_bits,
+  uint32_t offset,
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data)
+{
+  ivf_pq::detail::pack_list_data(list_data, codes, offset, pq_bits, res.get_stream());
+}
+}  // namespace codepacker
+
+/**
+ * Write flat PQ codes into an existing list by the given offset.
+ *
+ * The list is identified by its label.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will write into the 137th cluster
+ *   uint32_t label = 137;
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<const uint8_t>(res, n_vec, index.pq_dim());
+ *   ... prepare n_vecs to pack into the list in codes ...
+ *   // write codes into the list starting from the 42nd position
+ *   ivf_pq::helpers::pack_list_data(res, &index, codes_to_pack, label, 42);
+ * @endcode
+ *
+ * @param[in] res
+ * @param[inout] index IVF-PQ index.
+ * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] label The id of the list (cluster) into which we write.
+ * @param[in] offset how many records to skip before writing the data into the list
+ */
+template <typename IdxT>
+void pack_list_data(raft::device_resources const& res,
+                    index<IdxT>* index,
+                    device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+                    uint32_t label,
+                    uint32_t offset)
+{
+  ivf_pq::detail::pack_list_data(res, index, codes, label, offset);
+}
+
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto codes = raft::make_device_matrix<float>(res, list_size, index.pq_dim());
+ *   // unpack the whole list
+ *   ivf_pq::helpers::unpack_list_data(res, index, codes.view(), label, 0);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset
+ *   How many records in the list to skip.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      uint32_t offset)
+{
+  return ivf_pq::detail::unpack_list_data<IdxT>(res, index, out_codes, label, offset);
+}
+
+/**
+ * @brief Unpack a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto codes = raft::make_device_matrix<float>(res, selected_indices.size(), index.pq_dim());
+ *   // decode the whole list
+ *   ivf_pq::helpers::unpack_list_data(
+ *       res, index, selected_indices.view(), codes.view(), label);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_vector_view<const uint32_t> in_cluster_indices,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label)
+{
+  return ivf_pq::detail::unpack_list_data<IdxT>(res, index, out_codes, label, in_cluster_indices);
+}
+
+/**
+ * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(res, list_size, index.dim());
+ *   // decode the whole list
+ *   ivf_pq::helpers::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset
+ *   How many records in the list to skip.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label,
+                           uint32_t offset)
+{
+  return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, offset);
+}
+
+/**
+ * @brief Decode a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct the fourth cluster
+ *   uint32_t label = 3;
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(
+ *                             res, selected_indices.size(), index.dim());
+ *   // decode the whole list
+ *   ivf_pq::helpers::reconstruct_list_data(
+ *       res, index, selected_indices.view(), decoded_vectors.view(), label);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_vector_view<const uint32_t> in_cluster_indices,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label)
+{
+  return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices);
+}
+
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will extend the fourth cluster
+ *   uint32_t label = 3;
+ *   // We will fill 4 new vectors
+ *   uint32_t n_vec = 4;
+ *   // Indices of the new vectors
+ *   auto indices = raft::make_device_vector<uint32_t>(res, n_vec);
+ *   ... fill the indices ...
+ *   auto new_codes = raft::make_device_matrix<uint8_t, uint32_t, row_major> new_codes(
+ *       res, n_vec, index.pq_dim());
+ *   ... fill codes ...
+ *   // extend list with new codes
+ *   ivf_pq::helpers::extend_list_with_codes(
+ *       res, &index, codes.view(), indices.view(), label);
+ * @endcode
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  ivf_pq::detail::extend_list_with_codes(res, index, new_codes, new_indices, label);
+}
+
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification
+ * step.
+ *
+ *  Usage example:
+ * @code{.cpp}
+ *   // We will extend the fourth cluster
+ *   uint32_t label = 3;
+ *   // We will extend with 4 new vectors
+ *   uint32_t n_vec = 4;
+ *   // Indices of the new vectors
+ *   auto indices = raft::make_device_vector<uint32_t>(res, n_vec);
+ *   ... fill the indices ...
+ *   auto new_vectors = raft::make_device_matrix<float, uint32_t, row_major> new_codes(
+ *       res, n_vec, index.dim());
+ *   ... fill vectors ...
+ *   // extend list with new vectors
+ *   ivf_pq::helpers::extend_list(
+ *       res, &index, new_vectors.view(), indices.view(), label);
+ * @endcode
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] new_vectors data to encode [n_rows, index.dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ *
+ */
+template <typename T, typename IdxT>
+void extend_list(raft::device_resources const& res,
+                 index<IdxT>* index,
+                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                 uint32_t label)
+{
+  ivf_pq::detail::extend_list(res, index, new_vectors, new_indices, label);
+}
+
+/**
+ * @brief Remove all data from a single list (cluster) in the index.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will erase the fourth cluster (label = 3)
+ *   ivf_pq::helpers::erase_list(res, &index, 3);
+ * @endcode
+ *
+ * @tparam IdxT
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
+{
+  ivf_pq::detail::erase_list(res, index, label);
+}
+
+/** @} */
+}  // namespace raft::neighbors::ivf_pq::helpers
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index c69829821a..07efcb099e 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -22,7 +22,11 @@
 
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/map_reduce.cuh>
+#include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_helpers.cuh>
 #include <raft/random/rng.cuh>
 #ifdef RAFT_COMPILED
 #include <raft/neighbors/specializations.cuh>
@@ -38,8 +42,6 @@
 #include <gtest/gtest.h>
 
 #include <cub/cub.cuh>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
 
 #include <algorithm>
 #include <cstddef>
@@ -115,6 +117,33 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream
   return os;
 }
 
+template <typename T>
+void compare_vectors_l2(
+  const raft::device_resources& res, T a, T b, uint32_t label, double compression_ratio, double eps)
+{
+  auto n_rows = a.extent(0);
+  auto dim    = a.extent(1);
+  rmm::mr::managed_memory_resource managed_memory;
+  auto dist = make_device_mdarray<double>(res, &managed_memory, make_extents<uint32_t>(n_rows));
+  linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) {
+    spatial::knn::detail::utils::mapping<float> f{};
+    double d = 0.0f;
+    for (uint32_t j = 0; j < dim; j++) {
+      double t = f(a(i, j)) - f(b(i, j));
+      d += t * t;
+    }
+    return sqrt(d / double(dim));
+  });
+  res.sync_stream();
+  for (uint32_t i = 0; i < n_rows; i++) {
+    double d = dist(i);
+    // The theoretical estimate of the error is hard to come up with,
+    // the estimate below is based on experimentation + curse of dimensionality
+    ASSERT_LE(d, 1.2 * eps * std::pow(2.0, compression_ratio))
+      << " (label = " << label << ", ix = " << i << ", eps = " << eps << ")";
+  }
+}
+
 template <typename IdxT>
 auto min_output_size(const raft::device_resources& handle,
                      const ivf_pq::index<IdxT>& index,
@@ -139,7 +168,6 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
   {
   }
 
- protected:
   void gen_data()
   {
     database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_);
@@ -178,7 +206,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     handle_.sync_stream(stream_);
   }
 
-  index<IdxT> build_only()
+  auto build_only()
   {
     auto ipams              = ps.index_params;
     ipams.add_data_on_build = true;
@@ -188,19 +216,17 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     return ivf_pq::build<DataT, IdxT>(handle_, ipams, index_view);
   }
 
-  index<IdxT> build_2_extends()
+  auto build_2_extends()
   {
-    rmm::device_uvector<IdxT> db_indices(ps.num_db_vecs, stream_);
-    thrust::sequence(handle_.get_thrust_policy(),
-                     thrust::device_pointer_cast(db_indices.data()),
-                     thrust::device_pointer_cast(db_indices.data() + ps.num_db_vecs));
+    auto db_indices = make_device_vector<IdxT>(handle_, ps.num_db_vecs);
+    linalg::map_offset(handle_, db_indices.view(), identity_op{});
     handle_.sync_stream(stream_);
     auto size_1 = IdxT(ps.num_db_vecs) / 2;
     auto size_2 = IdxT(ps.num_db_vecs) - size_1;
     auto vecs_1 = database.data();
     auto vecs_2 = database.data() + size_t(size_1) * size_t(ps.dim);
-    auto inds_1 = db_indices.data();
-    auto inds_2 = db_indices.data() + size_t(size_1);
+    auto inds_1 = db_indices.data_handle();
+    auto inds_2 = db_indices.data_handle() + size_t(size_1);
 
     auto ipams              = ps.index_params;
     ipams.add_data_on_build = false;
@@ -220,17 +246,160 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     return idx;
   }
 
-  index<IdxT> build_serialize()
+  auto build_serialize()
   {
     ivf_pq::serialize<IdxT>(handle_, "ivf_pq_index", build_only());
     return ivf_pq::deserialize<IdxT>(handle_, "ivf_pq_index");
   }
 
+  void check_reconstruction(const index<IdxT>& index,
+                            double compression_ratio,
+                            uint32_t label,
+                            uint32_t n_take,
+                            uint32_t n_skip)
+  {
+    auto& rec_list = index.lists()[label];
+    auto dim       = index.dim();
+    n_take         = std::min<uint32_t>(n_take, rec_list->size.load());
+    n_skip         = std::min<uint32_t>(n_skip, rec_list->size.load() - n_take);
+
+    if (n_take == 0) { return; }
+
+    auto rec_data  = make_device_matrix<DataT>(handle_, n_take, dim);
+    auto orig_data = make_device_matrix<DataT>(handle_, n_take, dim);
+
+    ivf_pq::helpers::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
+
+    matrix::gather(database.data(),
+                   IdxT{dim},
+                   IdxT{n_take},
+                   rec_list->indices.data_handle() + n_skip,
+                   IdxT{n_take},
+                   orig_data.data_handle(),
+                   stream_);
+
+    compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.06);
+  }
+
+  void check_reconstruct_extend(index<IdxT>* index, double compression_ratio, uint32_t label)
+  {
+    // NB: this is not reference, the list is retained; the index will have to create a new list on
+    // `erase_list` op.
+    auto old_list = index->lists()[label];
+    auto n_rows   = old_list->size.load();
+    if (n_rows == 0) { return; }
+
+    auto vectors_1 = make_device_matrix<EvalT>(handle_, n_rows, index->dim());
+    auto indices   = make_device_vector<IdxT>(handle_, n_rows);
+    copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
+
+    ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0);
+    ivf_pq::helpers::erase_list(handle_, index, label);
+    // NB: passing the type parameter because const->non-const implicit conversion of the mdspans
+    // breaks type inference
+    ivf_pq::helpers::extend_list<EvalT, IdxT>(
+      handle_, index, vectors_1.view(), indices.view(), label);
+
+    auto& new_list = index->lists()[label];
+    ASSERT_NE(old_list.get(), new_list.get())
+      << "The old list should have been shared and retained after ivf_pq index has erased the "
+         "corresponding cluster.";
+
+    auto vectors_2 = make_device_matrix<EvalT>(handle_, n_rows, index->dim());
+    ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0);
+    // The code search is unstable, and there's high chance of repeating values of the lvl-2 codes.
+    // Hence, encoding-decoding chain often leads to altering both the PQ codes and the
+    // reconstructed data.
+    compare_vectors_l2(
+      handle_, vectors_1.view(), vectors_2.view(), label, compression_ratio, 0.025);
+  }
+
+  void check_packing(index<IdxT>* index, uint32_t label)
+  {
+    auto old_list = index->lists()[label];
+    auto n_rows   = old_list->size.load();
+
+    if (n_rows == 0) { return; }
+
+    auto codes   = make_device_matrix<uint8_t>(handle_, n_rows, index->pq_dim());
+    auto indices = make_device_vector<IdxT>(handle_, n_rows);
+    copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
+
+    ivf_pq::helpers::unpack_list_data(handle_, *index, codes.view(), label, 0);
+    ivf_pq::helpers::erase_list(handle_, index, label);
+    ivf_pq::helpers::extend_list_with_codes<IdxT>(
+      handle_, index, codes.view(), indices.view(), label);
+
+    auto& new_list = index->lists()[label];
+    ASSERT_NE(old_list.get(), new_list.get())
+      << "The old list should have been shared and retained after ivf_pq index has erased the "
+         "corresponding cluster.";
+    auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) *
+                          new_list->data.extent(2) * new_list->data.extent(3);
+
+    ASSERT_TRUE(old_list->data.size() >= list_data_size);
+    ASSERT_TRUE(new_list->data.size() >= list_data_size);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
+
+    // Pack a few vectors back to the list.
+    int row_offset = 9;
+    int n_vec      = 3;
+    ASSERT_TRUE(row_offset + n_vec < n_rows);
+    size_t offset      = row_offset * index->pq_dim();
+    auto codes_to_pack = make_device_matrix_view<const uint8_t, uint32_t>(
+      codes.data_handle() + offset, n_vec, index->pq_dim());
+    ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
+
+    // Another test with the API that take list_data directly
+    auto list_data  = index->lists()[label]->data.view();
+    uint32_t n_take = 4;
+    ASSERT_TRUE(row_offset + n_take < n_rows);
+    auto codes2 = raft::make_device_matrix<uint8_t>(handle_, n_take, index->pq_dim());
+    ivf_pq::helpers::codepacker::unpack(
+      handle_, list_data, index->pq_bits(), row_offset, codes2.view());
+
+    // Write it back
+    ivf_pq::helpers::codepacker::pack(
+      handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
+  }
+
   template <typename BuildIndex>
   void run(BuildIndex build_index)
   {
     index<IdxT> index = build_index();
 
+    double compression_ratio =
+      static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
+
+    for (uint32_t label = 0; label < index.n_lists(); label++) {
+      switch (label % 3) {
+        case 0: {
+          // Reconstruct and re-write vectors for one label
+          check_reconstruct_extend(&index, compression_ratio, label);
+        } break;
+        case 1: {
+          // Dump and re-write codes for one label
+          check_packing(&index, label);
+        } break;
+        default: {
+          // check a small subset of data in a randomly chosen cluster to see if the data
+          // reconstruction works well.
+          check_reconstruction(index, compression_ratio, label, 100, 7);
+        }
+      }
+    }
+
     size_t queries_size = ps.num_queries * ps.k;
     std::vector<IdxT> indices_ivf_pq(queries_size);
     std::vector<EvalT> distances_ivf_pq(queries_size);
@@ -255,11 +424,9 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     // A very conservative lower bound on recall
     double min_recall =
       static_cast<double>(ps.search_params.n_probes) / static_cast<double>(ps.index_params.n_lists);
-    double low_precision_factor =
-      static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
     // Using a heuristic to lower the required recall due to code-packing errors
     min_recall =
-      std::min(std::erfc(0.05 * low_precision_factor / std::max(min_recall, 0.5)), min_recall);
+      std::min(std::erfc(0.05 * compression_ratio / std::max(min_recall, 0.5)), min_recall);
     // Use explicit per-test min recall value if provided.
     min_recall = ps.min_recall.value_or(min_recall);
 
@@ -269,7 +436,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                                 distances_ivf_pq,
                                 ps.num_queries,
                                 ps.k,
-                                0.0001 * low_precision_factor,
+                                0.0001 * compression_ratio,
                                 min_recall))
       << ps;