Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IVF-PQ: manipulating individual lists #1298

Merged
merged 48 commits into from
Apr 17, 2023
Merged
Changes from 1 commit
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
d421375
Add a method to reconstruct the compressed index data
achirkin Feb 23, 2023
c7b5574
Fix a typo in the docs
achirkin Feb 23, 2023
087919f
Relax the constraints a bit
achirkin Feb 23, 2023
b5e9844
Add the public interface
achirkin Feb 23, 2023
ea81c1e
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Feb 27, 2023
6567f5c
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Feb 28, 2023
3ccd48f
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 6, 2023
fdb8a6a
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 7, 2023
56ffc84
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 8, 2023
ebd1b1c
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 15, 2023
2773a86
Fix the merge errors
achirkin Mar 15, 2023
82be40c
Add an option to reconstruct cluster data by in-cluster indices
achirkin Mar 15, 2023
5fc1c53
Update the docs for the new function
achirkin Mar 15, 2023
ddc4466
Detach reconstruction logic from the data reading logic
achirkin Mar 15, 2023
56eb716
Merge remote-tracking branch 'rapidsai/branch-23.04' into fea-ivf-pq-…
achirkin Mar 16, 2023
3a2c622
Implement unpack_list_data
achirkin Mar 16, 2023
e7c35e7
Public interface for unpack_list_data
achirkin Mar 16, 2023
839cd48
Fix (unrelated to the PR) test condition being just a little bit too …
achirkin Mar 16, 2023
46d5a84
Fix a miswording in the docs
achirkin Mar 16, 2023
9fc9818
Add public api for extending individual lists
achirkin Mar 16, 2023
2f1f6f7
Merge remote-tracking branch 'rapidsai/branch-23.04' into fea-ivf-pq-…
achirkin Mar 16, 2023
ac8aa18
Implemented pack_list_data
achirkin Mar 16, 2023
d5b2d1b
Merge remote-tracking branch 'rapidsai/branch-23.04' into fea-ivf-pq-…
achirkin Mar 16, 2023
f62273d
Reuse write_vector inside the process_and_fill_codes_kernel
achirkin Mar 16, 2023
38895bb
Initial implementation of extend_list (failing tests)
achirkin Mar 17, 2023
a674768
Adjust the scheduling of the encode_list_data_kernel
achirkin Mar 17, 2023
1be2f6b
Factor code-packing out of the build file
achirkin Mar 17, 2023
d6cad17
Fix failing tests
achirkin Mar 17, 2023
2625666
Merge remote-tracking branch 'rapidsai/branch-23.04' into fea-ivf-pq-…
achirkin Mar 17, 2023
172673b
Merge remote-tracking branch 'rapidsai/branch-23.04' into fea-ivf-pq-…
achirkin Mar 17, 2023
1504027
Relax the test criterion eps a little bit
achirkin Mar 17, 2023
1326ac7
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 19, 2023
08cfacb
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 20, 2023
a399023
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 21, 2023
d935568
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
cjnolet Mar 23, 2023
d1dd238
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
cjnolet Mar 23, 2023
5514d7d
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 25, 2023
a317622
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 25, 2023
9367936
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
achirkin Mar 28, 2023
09f1a0d
Move ivf_pq helpers to separate file and namespace
tfeher Mar 31, 2023
b7e811a
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
tfeher Mar 31, 2023
926f510
Add public API for pack_list_data
tfeher Mar 31, 2023
8e311d9
Increase tolerance for vector reconstruction test
tfeher Apr 2, 2023
158189c
Merge branch 'branch-23.04' into fea-ivf-pq-reconstruct
tfeher Apr 3, 2023
280c040
Correct number of list elements to compare
tfeher Apr 3, 2023
a7e5419
Add ivf_pq::helpers::codepacker::pack / unpack
tfeher Apr 5, 2023
09abadb
Merge branch 'branch-23.06' into fea-ivf-pq-reconstruct
cjnolet Apr 12, 2023
923c3b5
Merge branch 'branch-23.06' into fea-ivf-pq-reconstruct
cjnolet Apr 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Implement unpack_list_data
achirkin committed Mar 16, 2023
commit 3a2c62235861e1e6aa7a1c64c4fa9e1e4036ae99
209 changes: 145 additions & 64 deletions cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
Original file line number Diff line number Diff line change
@@ -557,6 +557,150 @@ void train_per_cluster(raft::device_resources const& handle,
transpose_pq_centers(handle, index, pq_centers_tmp.data());
}

/**
 * Process a single vector in a list.
 *
 * Decodes the `pq_dim` PQ codes of one encoded vector out of the interleaved list
 * layout and invokes `action` once per code.
 *
 * @tparam PqBits the bit-width of a single PQ code
 * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack)
 *
 * @param[in] in_list_data the encoded cluster data.
 * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
 * @param[in] out_ix the output index passed to the action
 * @param[in] pq_dim the number of PQ codes per vector
 * @param action a callable action to be invoked on each PQ code (component of the encoding)
 *   type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
 */
template <uint32_t PqBits, typename Action>
__device__ void run_on_vector(
  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
  uint32_t in_ix,
  uint32_t out_ix,
  uint32_t pq_dim,
  Action action)
{
  // The list data is interleaved in groups of kIndexGroupSize vectors:
  // split the in-cluster index into the group index and the position within the group.
  using group_align = Pow2<kIndexGroupSize>;
  const uint32_t group_ix = group_align::div(in_ix);
  const uint32_t ingroup_ix = group_align::mod(in_ix);

  pq_vec_t code_chunk;
  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
  // How many PqBits-wide codes fit into one pq_vec_t chunk.
  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
  // j counts decoded codes [0..pq_dim); i counts the chunks read from the list data.
  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
    // read the chunk
    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
    // read the codes, one/pq_dim at a time
#pragma unroll
    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
      // hand one code (one component of the encoded vector) to the action
      action(code_view[k], out_ix, j);
    }
  }
}

/**
 * Process the given indices or a block of a single list (cluster).
 *
 * One `action` invocation per output index in [0, len); the source position is either
 * `offset + ix` (contiguous block) or `indices[ix]` (gather), depending on which
 * alternative `offset_or_indices` holds.
 */
template <uint32_t PqBits, typename Action>
__device__ void run_on_list(device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
                            device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
                            std::variant<uint32_t, const uint32_t*> offset_or_indices,
                            uint32_t len,
                            uint32_t cluster_ix,
                            uint32_t pq_dim,
                            Action action)
{
  auto pq_extents =
    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
  auto pq_dataset =
    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);

  // Grid-stride loop: stride by the total number of threads in the grid so that each
  // output index is visited by exactly one thread. (The previous stride of plain
  // `blockDim.x` made every block redundantly re-process all indices past its own
  // starting offset; the launcher uses ceil(len / blockDim.x) blocks.)
  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len;
       ix += blockDim.x * gridDim.x) {
    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
                              ? std::get<uint32_t>(offset_or_indices) + ix
                              : std::get<const uint32_t*>(offset_or_indices)[ix];
    run_on_vector<PqBits>(pq_dataset, src_ix, ix, pq_dim, action);
  }
}

/**
 * A consumer for `run_on_list` and `run_on_vector` that flattens PQ codes
 * one-per-byte: independent of the code width (pq_bits), each code occupies
 * a whole byte, so a single vector takes pq_dim bytes in the output.
 */
struct unpack_codes {
  device_matrix_view<uint8_t, uint32_t, row_major> out_codes;

  /**
   * Create a callable to be passed to `run_on_list`.
   *
   * @param[out] out_codes the destination for the read codes.
   */
  __device__ inline unpack_codes(device_matrix_view<uint8_t, uint32_t, row_major> out_codes)
    : out_codes{out_codes}
  {
  }

  /** Store the j-th component (code) of the i-th vector in the output matrix. */
  __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j)
  {
    out_codes(i, j) = code;
  }
};

/** Kernel: flatten the codes of one cluster (or selected rows of it) into `out_codes`. */
template <uint32_t BlockSize, uint32_t PqBits>
__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel(
  device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
  device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
  device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
  uint32_t cluster_ix,
  std::variant<uint32_t, const uint32_t*> offset_or_indices)
{
  // out_codes is [n_rows, pq_dim]; the action writes one byte per code.
  run_on_list<PqBits>(data_ptrs,
                      list_sizes,
                      offset_or_indices,
                      out_codes.extent(0),
                      cluster_ix,
                      out_codes.extent(1),
                      unpack_codes{out_codes});
}

/** Decode the list data; see the public interface for the api and usage. */
template <typename IdxT>
void unpack_list_data(raft::device_resources const& res,
                      const index<IdxT>& index,
                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
                      uint32_t label,
                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
{
  const auto n_rows = out_codes.extent(0);
  if (n_rows == 0) { return; }
  // When a plain offset is given, validate it against the list size on the host.
  if (std::holds_alternative<uint32_t>(offset_or_indices)) {
    const auto n_skip = std::get<uint32_t>(offset_or_indices);
    // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
    // to avoid an extra device-host data copy and the stream sync.
    RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(),
                 "offset + output size must be not bigger than the cluster size.");
  }

  constexpr uint32_t kBlockSize = 256;
  const dim3 threads(kBlockSize, 1, 1);
  const dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);

  // Select the kernel instantiation matching the runtime code width.
  auto kernel = unpack_list_data_kernel<kBlockSize, 8>;
  switch (index.pq_bits()) {
    case 4: kernel = unpack_list_data_kernel<kBlockSize, 4>; break;
    case 5: kernel = unpack_list_data_kernel<kBlockSize, 5>; break;
    case 6: kernel = unpack_list_data_kernel<kBlockSize, 6>; break;
    case 7: kernel = unpack_list_data_kernel<kBlockSize, 7>; break;
    case 8: kernel = unpack_list_data_kernel<kBlockSize, 8>; break;
    default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", index.pq_bits());
  }
  kernel<<<blocks, threads, 0, res.get_stream()>>>(
    out_codes, index.data_ptrs(), index.list_sizes(), label, offset_or_indices);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}

/** A consumer for the `run_on_list` and `run_on_vec` that approximates the original input data. */
struct reconstruct_vectors {
codebook_gen codebook_kind;
uint32_t cluster_ix;
@@ -566,7 +710,7 @@ struct reconstruct_vectors {
device_mdspan<float, extent_3d<uint32_t>, row_major> out_vectors;

/**
* Create the functor to be passed to `run_on_list`.
* Create a callable to be passed to `run_on_list`.
*
* @param[out] out_vectors the destination for the decoded vectors.
* @param[in] pq_centers the codebook
@@ -626,69 +770,6 @@ struct reconstruct_vectors {
}
};

/**
 * Process a single vector in a list.
 *
 * Decodes the `pq_dim` PQ codes of one encoded vector out of the interleaved list
 * layout and invokes `action` once per code.
 *
 * @tparam PqBits the bit-width of a single PQ code
 * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack)
 *
 * @param[in] in_list_data the encoded cluster data.
 * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
 * @param[in] out_ix the output index passed to the action
 * @param[in] pq_dim the number of PQ codes per vector
 * @param action a callable action to be invoked on each PQ code (component of the encoding)
 *   type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
 */
template <uint32_t PqBits, typename Action>
__device__ void run_on_vector(
  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
  uint32_t in_ix,
  uint32_t out_ix,
  uint32_t pq_dim,
  Action action)
{
  // The list data is interleaved in groups of kIndexGroupSize vectors:
  // split the in-cluster index into the group index and the position within the group.
  using group_align = Pow2<kIndexGroupSize>;
  const uint32_t group_ix = group_align::div(in_ix);
  const uint32_t ingroup_ix = group_align::mod(in_ix);

  pq_vec_t code_chunk;
  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
  // How many PqBits-wide codes fit into one pq_vec_t chunk.
  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
  // j counts decoded codes [0..pq_dim); i counts the chunks read from the list data.
  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
    // read the chunk
    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
    // read the codes, one/pq_dim at a time
#pragma unroll
    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
      // hand one code (one component of the encoded vector) to the action
      action(code_view[k], out_ix, j);
    }
  }
}

/**
 * Process the given indices or a block of a single list (cluster).
 *
 * One `action` invocation per output index in [0, len); the source position is either
 * `offset + ix` (contiguous block) or `indices[ix]` (gather), depending on which
 * alternative `offset_or_indices` holds.
 */
template <uint32_t PqBits, typename Action>
__device__ void run_on_list(device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
                            device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
                            std::variant<uint32_t, const uint32_t*> offset_or_indices,
                            uint32_t len,
                            uint32_t cluster_ix,
                            uint32_t pq_dim,
                            Action action)
{
  auto pq_extents =
    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
  auto pq_dataset =
    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);

  // Grid-stride loop: stride by the total number of threads in the grid so that each
  // output index is visited by exactly one thread. (The previous stride of plain
  // `blockDim.x` made every block redundantly re-process all indices past its own
  // starting offset; the launcher uses ceil(len / blockDim.x) blocks.)
  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len;
       ix += blockDim.x * gridDim.x) {
    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
                              ? std::get<uint32_t>(offset_or_indices) + ix
                              : std::get<const uint32_t*>(offset_or_indices)[ix];
    run_on_vector<PqBits>(pq_dataset, src_ix, ix, pq_dim, action);
  }
}

template <uint32_t BlockSize, uint32_t PqBits>
__launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
device_matrix_view<float, uint32_t, row_major> out_vectors,