Skip to content

Commit

Permalink
ivf-pq performance tweaks (#926)
Browse files Browse the repository at this point in the history
A few optimizations to the `ivfpq_compute_similarity_kernel`:

  - Overhauled the way shmem/L1 carveout is selected
  - Introduced the block size selection logic based on the shmem/L1 split, occupancy, and the estimated cluster probes co-residency
  - Ported a new warp-sort module (`warp_sort_distributed`)
  - Transposed `pq_centers` to make loads coalesced
  - Changed layout of `pq_dataset` to make loads coalesced and vectorized
  - Optimized the loops to minimize ALU load

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #926
  • Loading branch information
achirkin authored Nov 17, 2022
1 parent e14bcbd commit e06b156
Show file tree
Hide file tree
Showing 17 changed files with 1,402 additions and 614 deletions.
10 changes: 5 additions & 5 deletions cpp/include/raft/neighbors/ivf_pq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ namespace raft::neighbors::ivf_pq {
*
* @param handle
* @param params configure the index building
* @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
* @param[in] dataset a device/host pointer to a row-major matrix [n_rows, dim]
* @param n_rows the number of samples
* @param dim the dimensionality of the data
*
Expand Down Expand Up @@ -91,8 +91,8 @@ inline auto build(
*
* @param handle
* @param orig_index original index
* @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
* @param[in] new_indices a device pointer to a vector of indices [n_rows].
* @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()]
* @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
* If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
* here to imply a continuous range `[0...n_rows)`.
* @param n_rows the number of samples
Expand All @@ -118,8 +118,8 @@ inline auto extend(const handle_t& handle,
*
* @param handle
* @param[inout] index
* @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
* @param[in] new_indices a device pointer to a vector of indices [n_rows].
* @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()]
* @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
* If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
* here to imply a continuous range `[0...n_rows)`.
* @param n_rows the number of samples
Expand Down
110 changes: 88 additions & 22 deletions cpp/include/raft/neighbors/ivf_pq_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <raft/distance/distance_types.hpp>
#include <raft/util/integer_utils.hpp>

#include <thrust/fill.h>

#include <type_traits>

namespace raft::neighbors::ivf_pq {
Expand Down Expand Up @@ -108,17 +110,30 @@ struct search_params : ann::search_params {
*/
cudaDataType_t internal_distance_dtype = CUDA_R_32F;
/**
* Thread block size of the distance calculation kernel at search time.
* When zero, an optimal block size is selected using a heuristic.
* Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
*
* Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
*
* One wants to increase the carveout to make sure a good GPU occupancy for the main search
* kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this
* value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache
* configurations, so the provided value is rounded up to the nearest configuration. Refer to the
* NVIDIA tuning guide for the target GPU architecture.
*
* Possible values: [0, 256, 512, 1024]
* Note, this is a low-level tuning parameter that can have drastic negative effects on the search
* performance if tweaked incorrectly.
*/
uint32_t preferred_thread_block_size = 0;
double preferred_shmem_carveout = 1.0;
};

static_assert(std::is_aggregate_v<index_params>);
static_assert(std::is_aggregate_v<search_params>);

/** Size of the interleaved group. */
constexpr static uint32_t kIndexGroupSize = 32;
/** Stride of the interleaved group for vectorized loads. */
constexpr static uint32_t kIndexGroupVecLen = 16;

/**
* @brief IVF-PQ index.
*
Expand Down Expand Up @@ -170,6 +185,19 @@ struct index : ann::index {
"IdxT must be able to represent all values of uint32_t");

public:
/**
* Default value filled in the `indices()` array.
* One may encounter it trying to access a record within a cluster that is outside of the
* `list_sizes()` bound (due to the record alignment `kIndexGroupSize`).
*/
constexpr static IdxT kInvalidRecord = std::numeric_limits<IdxT>::max() - 1;
/**
* Default value returned by `search` when the `n_probes` is too small and top-k is too large.
* One may encounter it if the combined size of probed clusters is smaller than the requested
* number of results per query.
*/
constexpr static IdxT kOutOfBoundsRecord = std::numeric_limits<IdxT>::max();

/** Total length of the index. */
[[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return indices_.extent(0); }
/** Dimensionality of the input data. */
Expand Down Expand Up @@ -247,12 +275,12 @@ struct index : ann::index {
pq_dim_(pq_dim == 0 ? calculate_pq_dim(dim) : pq_dim),
n_nonempty_lists_(n_nonempty_lists),
pq_centers_{make_device_mdarray<float>(handle, make_pq_centers_extents())},
pq_dataset_{make_device_mdarray<uint8_t>(
handle, make_extents<IdxT>(0, this->pq_dim() * this->pq_bits() / 8))},
pq_dataset_{make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(0))},
indices_{make_device_mdarray<IdxT>(handle, make_extents<IdxT>(0))},
rotation_matrix_{
make_device_mdarray<float>(handle, make_extents<uint32_t>(this->rot_dim(), this->dim()))},
list_offsets_{make_device_mdarray<IdxT>(handle, make_extents<uint32_t>(this->n_lists() + 1))},
list_sizes_{make_device_mdarray<uint32_t>(handle, make_extents<uint32_t>(this->n_lists()))},
centers_{make_device_mdarray<float>(
handle, make_extents<uint32_t>(this->n_lists(), this->dim_ext()))},
centers_rot_{make_device_mdarray<float>(
Expand Down Expand Up @@ -283,35 +311,49 @@ struct index : ann::index {
*/
void allocate(const handle_t& handle, IdxT index_size)
{
pq_dataset_ =
make_device_mdarray<uint8_t>(handle, make_extents<IdxT>(index_size, pq_dataset_.extent(1)));
indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
pq_dataset_ = make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(index_size));
indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
if (index_size > 0) {
thrust::fill_n(
handle.get_thrust_policy(), indices_.data_handle(), index_size, kInvalidRecord);
}
check_consistency();
}

using pq_centers_extents =
std::experimental::extents<uint32_t, dynamic_extent, dynamic_extent, dynamic_extent>;
/**
* PQ cluster centers
*
* - codebook_gen::PER_SUBSPACE: [pq_dim , pq_book_size, pq_len]
* - codebook_gen::PER_CLUSTER: [n_lists, pq_book_size, pq_len]
* - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
* - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size]
*/
inline auto pq_centers() noexcept -> device_mdspan<float, extent_3d<uint32_t>, row_major>
inline auto pq_centers() noexcept -> device_mdspan<float, pq_centers_extents, row_major>
{
return pq_centers_.view();
}
[[nodiscard]] inline auto pq_centers() const noexcept
-> device_mdspan<const float, extent_3d<uint32_t>, row_major>
-> device_mdspan<const float, pq_centers_extents, row_major>
{
return pq_centers_.view();
}

/** PQ-encoded data [size, pq_dim * pq_bits / 8]. */
inline auto pq_dataset() noexcept -> device_mdspan<uint8_t, extent_2d<IdxT>, row_major>
using pq_dataset_extents = std::experimental::
extents<IdxT, dynamic_extent, dynamic_extent, kIndexGroupSize, kIndexGroupVecLen>;
/** PQ-encoded data stored in the interleaved format:
*
* [ ceildiv(size, kIndexGroupSize)
* , ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits)
* , kIndexGroupSize
* , kIndexGroupVecLen
* ].
*/
inline auto pq_dataset() noexcept -> device_mdspan<uint8_t, pq_dataset_extents, row_major>
{
return pq_dataset_.view();
}
[[nodiscard]] inline auto pq_dataset() const noexcept
-> device_mdspan<const uint8_t, extent_2d<IdxT>, row_major>
-> device_mdspan<const uint8_t, pq_dataset_extents, row_major>
{
return pq_dataset_.view();
}
Expand Down Expand Up @@ -352,6 +394,17 @@ struct index : ann::index {
return list_offsets_.view();
}

/** Sizes of the lists [n_lists]. */
inline auto list_sizes() noexcept -> device_mdspan<uint32_t, extent_1d<uint32_t>, row_major>
{
return list_sizes_.view();
}
[[nodiscard]] inline auto list_sizes() const noexcept
-> device_mdspan<const uint32_t, extent_1d<uint32_t>, row_major>
{
return list_sizes_.view();
}

/** Cluster centers corresponding to the lists in the original space [n_lists, dim_ext] */
inline auto centers() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
{
Expand All @@ -374,6 +427,18 @@ struct index : ann::index {
return centers_rot_.view();
}

/** A helper function to determine the extents of an array enough to hold a given amount of data.
*/
auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents
{
// how many elems of pq_dim fit into one kIndexGroupVecLen-byte chunk
auto pq_chunk = (kIndexGroupVecLen * 8u) / pq_bits();
return make_extents<IdxT>(raft::div_rounding_up_safe<IdxT>(n_rows, kIndexGroupSize),
raft::div_rounding_up_safe<IdxT>(pq_dim(), pq_chunk),
kIndexGroupSize,
kIndexGroupVecLen);
}

private:
raft::distance::DistanceType metric_;
codebook_gen codebook_kind_;
Expand All @@ -383,11 +448,12 @@ struct index : ann::index {
uint32_t pq_dim_;
uint32_t n_nonempty_lists_;

device_mdarray<float, extent_3d<uint32_t>, row_major> pq_centers_;
device_mdarray<uint8_t, extent_2d<IdxT>, row_major> pq_dataset_;
device_mdarray<float, pq_centers_extents, row_major> pq_centers_;
device_mdarray<uint8_t, pq_dataset_extents, row_major> pq_dataset_;
device_mdarray<IdxT, extent_1d<IdxT>, row_major> indices_;
device_mdarray<float, extent_2d<uint32_t>, row_major> rotation_matrix_;
device_mdarray<IdxT, extent_1d<uint32_t>, row_major> list_offsets_;
device_mdarray<uint32_t, extent_1d<uint32_t>, row_major> list_sizes_;
device_mdarray<float, extent_2d<uint32_t>, row_major> centers_;
device_mdarray<float, extent_2d<uint32_t>, row_major> centers_rot_;

Expand All @@ -404,13 +470,13 @@ struct index : ann::index {
pq_bits() * pq_dim());
}

auto make_pq_centers_extents() -> extent_3d<uint32_t>
auto make_pq_centers_extents() -> pq_centers_extents
{
switch (codebook_kind()) {
case codebook_gen::PER_SUBSPACE:
return make_extents<uint32_t>(pq_dim(), pq_book_size(), pq_len());
return make_extents<uint32_t>(pq_dim(), pq_len(), pq_book_size());
case codebook_gen::PER_CLUSTER:
return make_extents<uint32_t>(n_lists(), pq_book_size(), pq_len());
return make_extents<uint32_t>(n_lists(), pq_len(), pq_book_size());
default: RAFT_FAIL("Unreachable code");
}
}
Expand All @@ -420,7 +486,7 @@ struct index : ann::index {
// If the dimensionality is large enough, we can reduce it to improve performance
if (dim >= 128) { dim /= 2; }
// Round it down to 32 to improve performance.
uint32_t r = raft::round_down_safe<uint32_t>(dim, 32);
auto r = raft::round_down_safe<uint32_t>(dim, 32);
if (r > 0) return r;
// If the dimensionality is really low, round it to the closest power-of-two
r = 1;
Expand Down
57 changes: 55 additions & 2 deletions cpp/include/raft/spatial/knn/detail/ann_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,9 @@ struct pointer_residency_count<Type, Types...> {
cudaPointerAttributes attr;
RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr));
switch (attr.type) {
case cudaMemoryTypeUnregistered:
case cudaMemoryTypeHost: return std::make_tuple(on_device, on_host + 1);
case cudaMemoryTypeUnregistered: return std::make_tuple(on_device, on_host + 1);
case cudaMemoryTypeHost:
return std::make_tuple(on_device + int(attr.devicePointer == ptr), on_host + 1);
case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host);
case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1);
default: return std::make_tuple(on_device, on_host);
Expand All @@ -75,6 +76,58 @@ auto check_pointer_residency(const Types*... ptrs) -> pointer_residency
return pointer_residency::mixed;
}

/** RAII helper to access the host data from gpu when necessary. */
template <typename PtrT, typename Action>
struct with_mapped_memory_t {
with_mapped_memory_t(PtrT ptr, size_t size, Action action) : action_(action)
{
if (ptr == nullptr) { return; }
switch (utils::check_pointer_residency(ptr)) {
case utils::pointer_residency::device_only:
case utils::pointer_residency::host_and_device: {
dev_ptr_ = (void*)ptr; // NOLINT
} break;
default: {
host_ptr_ = (void*)ptr; // NOLINT
RAFT_CUDA_TRY(cudaHostRegister(host_ptr_, size, choose_flags(ptr)));
RAFT_CUDA_TRY(cudaHostGetDevicePointer(&dev_ptr_, host_ptr_, 0));
} break;
}
}

~with_mapped_memory_t()
{
if (host_ptr_ != nullptr) { cudaHostUnregister(host_ptr_); }
}

auto operator()() { return action_((PtrT)dev_ptr_); } // NOLINT

private:
Action action_;
void* host_ptr_ = nullptr;
void* dev_ptr_ = nullptr;

template <typename T>
static auto choose_flags(const T*) -> unsigned int
{
int dev_id, readonly_supported;
RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
RAFT_CUDA_TRY(cudaDeviceGetAttribute(
&readonly_supported, cudaDevAttrHostRegisterReadOnlySupported, dev_id));
if (readonly_supported) {
return cudaHostRegisterMapped | cudaHostRegisterReadOnly;
} else {
return cudaHostRegisterMapped;
}
}

template <typename T>
static auto choose_flags(T*) -> unsigned int
{
return cudaHostRegisterMapped;
}
};

template <typename T>
struct config {
};
Expand Down
Loading

0 comments on commit e06b156

Please sign in to comment.