diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 7c2fa05bfe..9cde1143e0 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -26,6 +26,7 @@
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
+#include <raft/neighbors/ivf_flat_codepacker.hpp>
 #include <raft/neighbors/ivf_flat_types.hpp>
 #include <raft/neighbors/ivf_list.hpp>
 #include <raft/neighbors/ivf_list_types.hpp>
@@ -416,4 +417,77 @@ inline void fill_refinement_index(raft::resources const& handle,
                                          refinement_index->veclen());
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
+
+template <typename T>
+__global__ void pack_interleaved_list_kernel(  // 1D launch: thread `tid` packs row `tid` of `codes`
+  const T* codes,     // flat input codes; row `tid` starts at `codes + tid * dim`
+  T* list_data,       // destination block, written in interleaved layout by `pack_1`
+  uint32_t n_rows,    // number of rows in `codes`; surplus threads exit without any access
+  uint32_t dim,       // number of elements per row
+  uint32_t veclen,    // size of interleaved data chunks, forwarded to `pack_1`
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)  // first dst row, or per-row dst rows
+{
+  const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= n_rows) { return; }  // must precede the `(*indices)[tid]` read to stay in bounds
+  const auto* indices   = std::get_if<const uint32_t*>(&offset_or_indices);
+  const uint32_t dst_ix = indices ? (*indices)[tid] : std::get<uint32_t>(offset_or_indices) + tid;
+  codepacker::pack_1(codes + tid * dim, list_data, dim, veclen, dst_ix);
+}
+
+template <typename T>
+__global__ void unpack_interleaved_list_kernel(  // 1D launch: thread `tid` fills row `tid`
+  const T* list_data,  // source block in interleaved layout, read by `unpack_1`
+  T* codes,            // flat output codes; row `tid` starts at `codes + tid * dim`
+  uint32_t n_rows,     // number of rows in `codes`; surplus threads exit without any access
+  uint32_t dim,        // number of elements per row
+  uint32_t veclen,     // size of interleaved data chunks, forwarded to `unpack_1`
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)  // first src row, or per-row src rows
+{
+  const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= n_rows) { return; }  // must precede the `(*indices)[tid]` read to stay in bounds
+  const auto* indices   = std::get_if<const uint32_t*>(&offset_or_indices);
+  const uint32_t src_ix = indices ? (*indices)[tid] : std::get<uint32_t>(offset_or_indices) + tid;
+  codepacker::unpack_1(list_data, codes + tid * dim, dim, veclen, src_ix);
+}
+
+template <typename T, typename IdxT>
+void pack_list_data(
+  raft::resources const& res,
+  device_matrix_view<const T, uint32_t, row_major> codes,
+  uint32_t veclen,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data)
+{
+  // Launches one thread per row of `codes` on the stream of `res`; empty input is a no-op.
+  static constexpr uint32_t kThreadsPerBlock = 256;
+  const uint32_t row_count                   = codes.extent(0);
+  const uint32_t row_width                   = codes.extent(1);
+  if (row_count == 0 || row_width == 0) { return; }
+  const dim3 grid(div_rounding_up_safe<uint32_t>(row_count, kThreadsPerBlock), 1, 1);
+  const dim3 block(kThreadsPerBlock, 1, 1);
+  pack_interleaved_list_kernel<<<grid, block, 0, resource::get_cuda_stream(res)>>>(
+    codes.data_handle(), list_data.data_handle(), row_count, row_width, veclen, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+template <typename T, typename IdxT>
+void unpack_list_data(
+  raft::resources const& res,
+  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
+  uint32_t veclen,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  device_matrix_view<T, uint32_t, row_major> codes)
+{
+  // Launches one thread per row of `codes` on the stream of `res`; empty output is a no-op.
+  static constexpr uint32_t kThreadsPerBlock = 256;
+  const uint32_t row_count                   = codes.extent(0);
+  const uint32_t row_width                   = codes.extent(1);
+  if (row_count == 0 || row_width == 0) { return; }
+  const dim3 grid(div_rounding_up_safe<uint32_t>(row_count, kThreadsPerBlock), 1, 1);
+  const dim3 block(kThreadsPerBlock, 1, 1);
+  unpack_interleaved_list_kernel<<<grid, block, 0, resource::get_cuda_stream(res)>>>(
+    list_data.data_handle(), codes.data_handle(), row_count, row_width, veclen, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 }  // namespace raft::neighbors::ivf_flat::detail
diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
new file mode 100644
index 0000000000..4594332fdf
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/neighbors/ivf_flat_types.hpp>
+
+#ifdef _RAFT_HAS_CUDA
+#include <raft/util/pow2_utils.cuh>
+#else
+#include <raft/util/integer_utils.hpp>
+#endif
+
+namespace raft::neighbors::ivf_flat::codepacker {
+
+template <typename T>
+_RAFT_HOST_DEVICE inline auto roundDown(T x)  // `x` rounded down w.r.t. `kIndexGroupSize`
+{
+#ifndef _RAFT_HAS_CUDA
+  return raft::round_down_safe(x, kIndexGroupSize);  // no `_RAFT_HAS_CUDA`: generic integer helper
+#else
+  return Pow2<kIndexGroupSize>::roundDown(x);  // with `_RAFT_HAS_CUDA`: `Pow2` utility
+#endif
+}
+
+template <typename T>
+_RAFT_HOST_DEVICE inline auto mod(T x)  // remainder of `x` w.r.t. `kIndexGroupSize`
+{
+#ifndef _RAFT_HAS_CUDA
+  return x % kIndexGroupSize;  // no `_RAFT_HAS_CUDA`: plain integer remainder
+#else
+  return Pow2<kIndexGroupSize>::mod(x);  // with `_RAFT_HAS_CUDA`: `Pow2` utility
+#endif
+}
+
+/**
+ * Store a single flat code in interleaved form inside `block`. `offset` is the id of the record
+ * within the list, i.e. the number of records that precede it. The interleaved block is intended
+ * to be copied over to the IVF list on device afterwards. NB: no memory allocation happens here;
+ * the block must fit the record (offset + 1).
+ *
+ * @tparam T element type of the code
+ *
+ * @param[in] flat_code input flat code, `dim` elements
+ * @param[out] block block of memory to write the interleaved code to
+ * @param[in] dim dimension of the flat code
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset how many records to skip before writing the data into the list
+ */
+template <typename T>
+_RAFT_HOST_DEVICE void pack_1(
+  const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset)
+{
+  // Records are stored in groups of `kIndexGroupSize`; within a group, the c-th `veclen`-sized
+  // chunks of its records are stored back to back, followed by the (c+1)-th chunks, and so on.
+  // NB: `veclen` is chosen such that `dim % veclen == 0`.
+  //
+  // Position of element 0 of this record: start of its group + its slot within the group.
+  const auto record_base = roundDown(offset) * dim + mod(offset) * veclen;
+
+  for (uint32_t chunk = 0; chunk < dim; chunk += veclen) {
+    const auto chunk_base = record_base + chunk * kIndexGroupSize;
+    for (uint32_t j = 0; j < veclen; j++) {
+      block[chunk_base + j] = flat_code[chunk + j];
+    }
+  }
+}
+
+/**
+ * Fetch the flat code of one record from a single list (cluster) of the index. `offset` is the
+ * id of the record within the list; the code is de-interleaved while it is read.
+ *
+ * @tparam T element type of the code
+ *
+ * @param[in] block interleaved block. The block can be thought of as the whole inverted list in
+ * interleaved format.
+ * @param[out] flat_code output flat code, `dim` elements
+ * @param[in] dim dimension of the flat code
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset fetch the flat code by the given offset
+ */
+template <typename T>
+_RAFT_HOST_DEVICE void unpack_1(
+  const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset)
+{
+  // Records are stored in groups of `kIndexGroupSize`; within a group, the c-th `veclen`-sized
+  // chunks of its records are stored back to back, followed by the (c+1)-th chunks, and so on.
+  // NB: `veclen` is chosen such that `dim % veclen == 0`.
+  //
+  // Position of element 0 of this record: start of its group + its slot within the group.
+  const auto record_base = roundDown(offset) * dim + mod(offset) * veclen;
+  for (uint32_t chunk = 0; chunk < dim; chunk += veclen) {
+    const auto chunk_base = record_base + chunk * kIndexGroupSize;
+    for (uint32_t j = 0; j < veclen; j++) {
+      flat_code[chunk + j] = block[chunk_base + j];
+    }
+  }
+}
+}  // namespace raft::neighbors::ivf_flat::codepacker
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
new file mode 100644
index 0000000000..096e8051c3
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/detail/ivf_flat_build.cuh>
+#include <raft/neighbors/ivf_flat_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::neighbors::ivf_flat::helpers {
+/**
+ * @defgroup ivf_flat_helpers Helper functions for manipulating IVF Flat Index
+ * @{
+ */
+
+namespace codepacker {
+
+/**
+ * Write flat codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data  = index.lists()[label]->data.view();
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<T>(res, n_vec, index.dim());
+ *   ... prepare n_vec vectors to pack into the list in codes ...
+ *   // write codes into the list, skipping the first 42 records
+ *   ivf_flat::helpers::codepacker::pack<T, IdxT>(
+ *       res, make_const_mdspan(codes.view()), index.veclen(), 42, list_data);
+ * @endcode
+ *
+ * @tparam T element type of the codes and of the list data
+ * @tparam IdxT index type parameter of the list (third parameter of `list_spec`)
+ *
+ * @param[in] res raft resource
+ * @param[in] codes flat codes [n_vec, dim]
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset how many records to skip before writing the data into the list
+ * @param[inout] list_data block to write into
+ */
+template <typename T, typename IdxT>
+void pack(
+  raft::resources const& res,
+  device_matrix_view<const T, uint32_t, row_major> codes,
+  uint32_t veclen,
+  uint32_t offset,
+  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data)
+{
+  raft::neighbors::ivf_flat::detail::pack_list_data<T, IdxT>(res, codes, veclen, offset, list_data);
+}
+
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster), starting at `offset`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data = index.lists()[label]->data.view();
+ *   // allocate the buffer for the output
+ *   uint32_t n_take = 4;
+ *   auto codes = raft::make_device_matrix<T>(res, n_take, index.dim());
+ *   uint32_t offset = 0;
+ *   // unpack n_take elements from the list
+ *   ivf_flat::helpers::codepacker::unpack<T, IdxT>(
+ *       res, list_data, index.veclen(), offset, codes.view());
+ * @endcode
+ *
+ * @tparam T element type of the list data and of the codes
+ * @tparam IdxT index type parameter of the list (third parameter of `list_spec`)
+ *
+ * @param[in] res raft resource
+ * @param[in] list_data block to read from
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset
+ *   How many records in the list to skip.
+ * @param[out] codes
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   `offset + n_take` must be <= the list size.
+ */
+template <typename T, typename IdxT>
+void unpack(
+  raft::resources const& res,
+  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
+  uint32_t veclen,
+  uint32_t offset,
+  device_matrix_view<T, uint32_t, row_major> codes)
+{
+  raft::neighbors::ivf_flat::detail::unpack_list_data<T, IdxT>(
+    res, list_data, veclen, offset, codes);
+}
+}  // namespace codepacker
+/** @} */
+}  // namespace raft::neighbors::ivf_flat::helpers
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index a252b26600..d72d73680a 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -17,15 +17,27 @@
 
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/neighbors/ivf_flat_types.hpp>
+#include <raft/neighbors/ivf_list.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/fast_int_div.cuh>
+#include <thrust/functional.h>
 
 #include <raft_internal/neighbors/naive_knn.cuh>
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_flat.cuh>
+#include <raft/neighbors/ivf_flat_helpers.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/ann.cuh>
 #include <raft/spatial/knn/knn.cuh>
@@ -36,6 +48,7 @@
 
 #include <gtest/gtest.h>
 
+#include <rmm/device_uvector.hpp>
 #include <thrust/sequence.h>
 
 #include <cstddef>
@@ -76,7 +89,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   {
   }
 
- protected:
   void testIVFFlat()
   {
     size_t queries_size = ps.num_queries * ps.k;
@@ -264,6 +276,136 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     }
   }
 
+  void testPacker()  // checks codepacker pack/unpack against lists produced by ivf_flat::extend
+  {
+    ivf_flat::index_params index_params;
+    ivf_flat::search_params search_params;
+    index_params.n_lists          = ps.nlist;
+    index_params.metric           = ps.metric;
+    index_params.adaptive_centers = false;
+    search_params.n_probes        = ps.nprobe;
+    // do not add the data on build; the lists of `idx` are filled via the codepacker below
+    index_params.add_data_on_build        = false;
+    index_params.kmeans_trainset_fraction = 1.0;
+    index_params.metric_arg               = 0;
+
+    auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+      (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+
+    auto idx = ivf_flat::build(handle_, index_params, database_view);
+    // reference index: the same dataset inserted through ivf_flat::extend
+    const std::optional<raft::device_vector_view<const IdxT, IdxT>> no_opt = std::nullopt;
+    index<DataT, IdxT> extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx);
+    // copy the list sizes of the reference index to the host
+    auto list_sizes = raft::make_host_vector<uint32_t>(idx.n_lists());
+    update_host(list_sizes.data_handle(),
+                extend_index.list_sizes().data_handle(),
+                extend_index.n_lists(),
+                stream_);
+    resource::sync_stream(handle_);
+
+    auto& lists = idx.lists();
+
+    // conservative memory allocation for codepacking
+    auto list_device_spec = list_spec<uint32_t, DataT, IdxT>{idx.dim(), false};
+    // resize every list of `idx` to the size of the matching list of the reference index
+    for (uint32_t label = 0; label < idx.n_lists(); label++) {
+      uint32_t list_size = list_sizes.data_handle()[label];
+
+      ivf::resize_list(handle_, lists[label], list_device_spec, list_size, 0);
+    }
+
+    idx.recompute_internal_state(handle_);
+
+    using interleaved_group = Pow2<kIndexGroupSize>;
+
+    for (uint32_t label = 0; label < idx.n_lists(); label++) {
+      uint32_t list_size = list_sizes.data_handle()[label];
+
+      if (list_size > 0) {
+        uint32_t padded_list_size = interleaved_group::roundUp(list_size);
+        uint32_t n_elems          = padded_list_size * idx.dim();
+        auto list_data            = lists[label]->data;
+        auto list_inds            = extend_index.lists()[label]->indices;
+
+        // fetch the flat codes: gather the database rows named by the reference list indices
+        auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
+
+        matrix::gather(
+          handle_,
+          make_device_matrix_view<const DataT, uint32_t>(
+            (const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), idx.dim()),
+          make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds.data_handle(),
+                                                        list_size),
+          flat_codes.view());
+        // pack the flat codes into the list of `idx`, starting at offset 0
+        helpers::codepacker::pack<DataT, IdxT>(
+          handle_, make_const_mdspan(flat_codes.view()), idx.veclen(), 0, list_data.view());
+        // compare the packed list with the reference list, ignoring the padding of the last group
+        {
+          auto mask = make_device_vector<bool>(handle_, n_elems);
+          // mask[i] is true iff element i of the padded list belongs to an actual record
+          linalg::map_offset(handle_,
+                             mask.view(),
+                             [dim = idx.dim(),
+                              list_size,
+                              padded_list_size,
+                              chunk_size = util::FastIntDiv(idx.veclen())] __device__(auto i) {
+                               uint32_t max_group_offset = interleaved_group::roundDown(list_size);
+                               if (i < max_group_offset * dim) { return true; }
+                               uint32_t surplus    = (i - max_group_offset * dim);
+                               uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size);
+                               return ingroup_id < (list_size - max_group_offset);
+                             });
+
+          // ensure that the correct number of indices are masked out
+          ASSERT_TRUE(thrust::reduce(resource::get_thrust_policy(handle_),
+                                     mask.data_handle(),
+                                     mask.data_handle() + n_elems,
+                                     0) == list_size * ps.dim);
+          // packed list data with the padding elements replaced by zeros
+          auto packed_list_data = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+
+          linalg::map_offset(handle_,
+                             packed_list_data.view(),
+                             [mask      = mask.data_handle(),
+                              list_data = list_data.data_handle()] __device__(uint32_t i) {
+                               if (mask[i]) return list_data[i];
+                               return DataT{0};
+                             });
+          // reference list data with the padding elements replaced by zeros
+          auto extend_data          = extend_index.lists()[label]->data;
+          auto extend_data_filtered = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+          linalg::map_offset(handle_,
+                             extend_data_filtered.view(),
+                             [mask        = mask.data_handle(),
+                              extend_data = extend_data.data_handle()] __device__(uint32_t i) {
+                               if (mask[i]) return extend_data[i];
+                               return DataT{0};
+                             });
+
+          ASSERT_TRUE(raft::devArrMatch(packed_list_data.data_handle(),
+                                        extend_data_filtered.data_handle(),
+                                        n_elems,
+                                        raft::Compare<DataT>(),
+                                        stream_));
+        }
+        // round trip: unpacking the list must give back the original flat codes
+        auto unpacked_flat_codes =
+          make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
+
+        helpers::codepacker::unpack<DataT, IdxT>(
+          handle_, list_data.view(), idx.veclen(), 0, unpacked_flat_codes.view());
+
+        ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(),
+                                      unpacked_flat_codes.data_handle(),
+                                      list_size * ps.dim,
+                                      raft::Compare<DataT>(),
+                                      stream_));
+      }
+    }
+  }
+
   void SetUp() override
   {
     database.resize(ps.num_db_vecs * ps.dim, stream_);
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
index f0988ca988..3bfea283e5 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
@@ -21,7 +21,11 @@
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, float, std::int64_t> AnnIVFFlatTestF;
-TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); }
+TEST_P(AnnIVFFlatTestF, AnnIVFFlat)
+{
+  this->testIVFFlat();
+  this->testPacker();  // additionally verifies helpers::codepacker::pack / unpack
+}
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));