[FEA] Codepacking for IVF-flat #1632

Merged (34 commits, merged Aug 1, 2023)
Changes from 17 commits

Commits (34):
cc9cbd3  Unpack list data kernel (tarang-jain, Jul 1, 2023)
28484ef  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 1, 2023)
e39ee56  update packing and unpacking functions (tarang-jain, Jul 5, 2023)
68bf927  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 5, 2023)
78d6380  Update codepacker (tarang-jain, Jul 14, 2023)
49a8834  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 14, 2023)
897338e  refactor codepacker (does not build) (tarang-jain, Jul 17, 2023)
c1d80f5  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 17, 2023)
2a2ee51  Undo deletions (tarang-jain, Jul 17, 2023)
834dd2c  undo yaml changes (tarang-jain, Jul 17, 2023)
6013429  style (tarang-jain, Jul 17, 2023)
ab6345a  Update tests, correct make_list_extents (tarang-jain, Jul 18, 2023)
ed80d1a  More changes (tarang-jain, Jul 19, 2023)
cdff9e1  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 19, 2023)
7412272  debugging (tarang-jain, Jul 20, 2023)
700ea82  Working build (tarang-jain, Jul 21, 2023)
27451c6  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 21, 2023)
9d742ef  rename codepacking api (tarang-jain, Jul 21, 2023)
d1ef8a1  Updated gtest (tarang-jain, Jul 27, 2023)
e187147  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 27, 2023)
4f233a6  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 27, 2023)
4ee99e3  updates (tarang-jain, Jul 27, 2023)
22f4f80  update testing (tarang-jain, Jul 28, 2023)
9f4e22c  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 28, 2023)
c95d1e0  updates (tarang-jain, Jul 28, 2023)
da78c66  Update testing, pow2 (tarang-jain, Jul 31, 2023)
5cc6dc9  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 31, 2023)
15db0c6  remove unneccessary changes (tarang-jain, Jul 31, 2023)
154dc6d  Delete log.txt (tarang-jain, Jul 31, 2023)
47d6421  updates (tarang-jain, Jul 31, 2023)
0f1d106  Merge branch 'faiss-ivf' of https://github.com/tarang-jain/raft into … (tarang-jain, Jul 31, 2023)
e2e1308  ore cleanup (tarang-jain, Jul 31, 2023)
3f470c8  Merge branch 'branch-23.08' of https://github.com/rapidsai/raft into … (tarang-jain, Jul 31, 2023)
41a49b2  style (tarang-jain, Jul 31, 2023)
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -57,4 +57,4 @@ dependencies:
 - ucx-proc=*=gpu
 - ucx-py==0.33.*
 - ucx>=1.13.0
-name: all_cuda-118_arch-x86_64
+name: all_cuda-118_arch-x86_64

2 changes: 1 addition & 1 deletion conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -53,4 +53,4 @@ dependencies:
 - ucx-proc=*=gpu
 - ucx-py==0.33.*
 - ucx>=1.13.0
-name: all_cuda-120_arch-x86_64
+name: all_cuda-120_arch-x86_64

2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -35,4 +35,4 @@ dependencies:
 - nlohmann_json>=3.11.2
 - scikit-build>=0.13.1
 - sysroot_linux-64==2.17
-name: bench_ann_cuda-118_arch-x86_64
+name: bench_ann_cuda-118_arch-x86_64
61 changes: 61 additions & 0 deletions cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -26,11 +26,13 @@
#include <raft/linalg/add.cuh>
#include <raft/linalg/map.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/neighbors/ivf_flat_codepacker.cuh>
#include <raft/neighbors/ivf_flat_types.hpp>
#include <raft/neighbors/ivf_list.hpp>
#include <raft/neighbors/ivf_list_types.hpp>
#include <raft/spatial/knn/detail/ann_utils.cuh>
#include <raft/stats/histogram.cuh>
#include <raft/util/fast_int_div.cuh>
#include <raft/util/pow2_utils.cuh>

#include <rmm/cuda_stream_view.hpp>
@@ -416,4 +418,63 @@ inline void fill_refinement_index(raft::resources const& handle,
                                    refinement_index->veclen());
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

template <typename T>
__global__ void pack_interleaved_list_kernel(
  const T* codes, T* list_data, uint32_t n_rows, uint32_t dim, uint32_t veclen)
{
  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n_rows) {
    codepacker::pack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid);
  }
}

template <typename T>
__global__ void unpack_interleaved_list_kernel(
  const T* list_data, T* codes, uint32_t n_rows, uint32_t dim, uint32_t veclen)
{
  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n_rows) {
    codepacker::unpack_1_interleaved(list_data, codes + tid * dim, dim, veclen, tid);
  }
}

template <typename T, typename IdxT>
void pack_list_data(
  raft::resources const& res,
  device_matrix_view<const T, uint32_t, row_major> codes,
  uint32_t veclen,
  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data)
{
  uint32_t n_rows = codes.extent(0);
  uint32_t dim    = codes.extent(1);
  static constexpr uint32_t kBlockSize = 256;
  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
  dim3 threads(kBlockSize, 1, 1);
  auto stream = resource::get_cuda_stream(res);
  pack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
    codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

[Review comment, Member] Any reason we couldn't just use a map() or map_offset() for this?
[Reply, Contributor (Author)] There is no particular reason; this can indeed be put inside a map_offset.

template <typename T, typename IdxT>
void unpack_list_data(
  raft::resources const& res,
  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
  uint32_t veclen,
  device_matrix_view<T, uint32_t, row_major> codes)
{
  uint32_t n_rows = codes.extent(0);
  uint32_t dim    = codes.extent(1);
  static constexpr uint32_t kBlockSize = 256;
  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
  dim3 threads(kBlockSize, 1, 1);
  auto stream = resource::get_cuda_stream(res);
  unpack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
    list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

} // namespace raft::neighbors::ivf_flat::detail
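
As the reviewer suggests above, these custom kernels could plausibly be replaced by raft::linalg::map_offset. The following is an editorial sketch, not code from this PR, of what the unpack direction might look like under that approach; it assumes map_offset(res, out, op) from <raft/linalg/map.cuh>, a build with extended device lambdas enabled, and it reuses kIndexGroupSize and Pow2 from the headers above.

// Editorial sketch (not part of this PR): the reviewer's map_offset() idea applied to unpacking.
#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/map.cuh>
#include <raft/neighbors/ivf_flat_types.hpp>
#include <raft/util/pow2_utils.cuh>

template <typename T>
void unpack_list_data_via_map_offset(raft::resources const& res,
                                     const T* block,  // packed (interleaved) list data
                                     raft::device_matrix_view<T, uint32_t, raft::row_major> codes,
                                     uint32_t veclen)
{
  const uint32_t dim = codes.extent(1);
  // Flatten the output so map_offset hands the lambda a linear element index.
  auto codes_flat = raft::make_device_vector_view<T, uint32_t>(codes.data_handle(),
                                                               codes.extent(0) * dim);
  raft::linalg::map_offset(res, codes_flat, [block, dim, veclen] __device__(uint32_t i) {
    using namespace raft::neighbors::ivf_flat;
    using interleaved_group  = raft::Pow2<kIndexGroupSize>;
    const uint32_t row       = i / dim;  // record id within the list
    const uint32_t col       = i % dim;  // dimension within the record
    const uint32_t group_off = interleaved_group::roundDown(row);
    const uint32_t ingroup   = interleaved_group::mod(row) * veclen;
    const uint32_t l         = (col / veclen) * veclen;  // start of this veclen chunk
    const uint32_t j         = col % veclen;             // position within the chunk
    return block[group_off * dim + l * kIndexGroupSize + ingroup + j];
  });
}

Note that packing is a scatter rather than a gather, so expressing it with map_offset over the destination buffer would also have to account for any padded rows beyond n_rows in the list.
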
89 changes: 89 additions & 0 deletions cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
@@ -0,0 +1,89 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <raft/core/device_mdspan.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resources.hpp>
#include <raft/neighbors/ivf_flat_types.hpp>
#include <raft/util/pow2_utils.cuh>

namespace raft::neighbors::ivf_flat::codepacker {
/**
* Write one flat code into a block by the given offset. The offset indicates the id of the record
* in the list. This function interleaves the code and is intended to later copy the interleaved
* codes over to the IVF list on device. NB: no memory allocation happens here; the block must fit
* the record (offset + 1).
*
* @tparam T
*
* @param[in] flat_code input flat code
* @param[out] block block of memory to write interleaved codes to
* @param[in] dim dimension of the flat code
* @param[in] veclen size of interleaved data chunks
* @param[in] offset how many records to skip before writing the data into the list
*/
template <typename T>
__host__ __device__ void pack_1_interleaved(
  const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset)
{
  // The data is written in interleaved groups of `kIndexGroupSize` vectors.
  using interleaved_group = Pow2<kIndexGroupSize>;

  // Interleave dimensions of the source vector while recording it.
  // NB: `veclen` is chosen such that `dim % veclen == 0`.
  auto group_offset = interleaved_group::roundDown(offset);
  auto ingroup_id   = interleaved_group::mod(offset) * veclen;

  for (uint32_t l = 0; l < dim; l += veclen) {
    for (uint32_t j = 0; j < veclen; j++) {
      block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j];
    }
  }
}

/**
* Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset
* indicates the id of the record. This function fetches one flat code from an interleaved code.
*
* @tparam T
*
* @param[in] block interleaved block. The block can be thought of as the whole inverted list in
* interleaved format.
* @param[out] flat_code output flat code
* @param[in] dim dimension of the flat code
* @param[in] veclen size of interleaved data chunks
* @param[in] offset fetch the flat code by the given offset
*/
template <typename T>
__host__ __device__ void unpack_1_interleaved(
  const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset)
{
  // The data is written in interleaved groups of `kIndexGroupSize` vectors.
  using interleaved_group = Pow2<kIndexGroupSize>;

  // NB: `veclen` is chosen such that `dim % veclen == 0`.
  auto group_offset = interleaved_group::roundDown(offset);
  auto ingroup_id   = interleaved_group::mod(offset) * veclen;

  for (uint32_t l = 0; l < dim; l += veclen) {
    for (uint32_t j = 0; j < veclen; j++) {
      flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j];
    }
  }
}
} // namespace raft::neighbors::ivf_flat::codepacker
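
Because both functions are __host__ __device__, their behavior can be checked with a small host-side round trip. The sketch below is editorial, not part of the PR; it relies on kIndexGroupSize from ivf_flat_types.hpp as the interleaving group size and assumes the file is compiled with nvcc.

// Editorial sketch: pack one flat code at a given offset, unpack it, and check the round trip.
#include <raft/neighbors/ivf_flat_codepacker.cuh>
#include <raft/neighbors/ivf_flat_types.hpp>

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  using namespace raft::neighbors::ivf_flat;
  constexpr uint32_t dim = 8, veclen = 4, offset = 5;

  std::vector<float> flat(dim);
  for (uint32_t i = 0; i < dim; i++) {
    flat[i] = static_cast<float>(i);
  }

  // One interleaved group holds kIndexGroupSize records; size the block for a single group.
  std::vector<float> block(kIndexGroupSize * dim, 0.0f);
  codepacker::pack_1_interleaved(flat.data(), block.data(), dim, veclen, offset);

  std::vector<float> out(dim, -1.0f);
  codepacker::unpack_1_interleaved(block.data(), out.data(), dim, veclen, offset);
  assert(out == flat);  // the unpacked code matches the original
  return 0;
}
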
56 changes: 56 additions & 0 deletions cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -0,0 +1,56 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <raft/neighbors/detail/ivf_flat_build.cuh>
#include <raft/neighbors/ivf_flat_types.hpp>

#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>

namespace raft::neighbors::ivf_flat::helpers {
/**
 * @defgroup ivf_flat_helpers Helper functions for manipulating IVF Flat Index
 * @{
 */

namespace codepacker {

template <typename T, typename IdxT>
void pack_full_list(
  raft::resources const& res,
  device_matrix_view<const T, uint32_t, row_major> codes,
  uint32_t veclen,
  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data)
{
  raft::neighbors::ivf_flat::detail::pack_list_data<T, IdxT>(res, codes, veclen, list_data);
}

template <typename T, typename IdxT>
void unpack_full_list(
  raft::resources const& res,
  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
  uint32_t veclen,
  device_matrix_view<T, uint32_t, row_major> codes)
{
  raft::neighbors::ivf_flat::detail::unpack_list_data<T, IdxT>(res, list_data, veclen, codes);
}
}  // namespace codepacker
/** @} */
} // namespace raft::neighbors::ivf_flat::helpers
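
A rough usage sketch for these wrappers follows. It is editorial rather than part of the PR: the names list_view, n_rows, dim, and veclen are placeholders, and obtaining the list's interleaved data mdspan from a built index is not shown.

// Editorial sketch: unpack one IVF list into a flat [n_rows, dim] codes matrix, then pack it back.
// `list_view` is assumed to be the list's interleaved data mdspan (list_spec<...>::list_extents).
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resources.hpp>
#include <raft/neighbors/ivf_flat_helpers.cuh>

template <typename T, typename IdxT, typename ListView>
void round_trip_one_list(raft::resources const& res,
                         ListView list_view,
                         uint32_t n_rows,
                         uint32_t dim,
                         uint32_t veclen)
{
  namespace packer = raft::neighbors::ivf_flat::helpers::codepacker;

  // Unpack the whole list into row-major flat codes...
  auto codes = raft::make_device_matrix<T, uint32_t>(res, n_rows, dim);
  packer::unpack_full_list<T, IdxT>(res, list_view, veclen, codes.view());

  // ...and write them back in interleaved layout (a no-op round trip here).
  packer::pack_full_list<T, IdxT>(res, codes.view(), veclen, list_view);
}
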
1 change: 1 addition & 0 deletions cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -113,6 +113,7 @@ struct list_spec {
  /** Determine the extents of an array enough to hold a given amount of data. */
  constexpr auto make_list_extents(SizeT n_rows) const -> list_extents
  {
    // return make_extents<SizeT>(round_up_safe(n_rows, this->align_min), dim);
    return make_extents<SizeT>(n_rows, dim);
  }
};

[Review comment, Member] Do we need to revert this? (on the commented-out `round_up_safe` line above)
1 change: 1 addition & 0 deletions cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -18,6 +18,7 @@

#include "../ann_common.h"
#include "../ivf_flat.cuh"
#include <cstring>
#include <raft/core/resource/cuda_stream.hpp>

#include "processing.cuh"