ivf-pq performance tweaks #926

Merged 34 commits into rapidsai/branch-22.12 on Nov 17, 2022.

Commits:
eceba57
Improvements for ivfpq_compute_similarity_kernel
achirkin Oct 12, 2022
3eb37b9
Remove debugging printf
achirkin Oct 12, 2022
094a8d5
Fix integer overflow for large n_queries * max_samples
achirkin Oct 12, 2022
bb68b03
Fix bad top-k kernel selection
achirkin Oct 13, 2022
33bc185
Limit max batch size to avoid extra large temporary buffers
achirkin Oct 13, 2022
9135a79
Change the layout of pq_centers for better memory utilization and opt…
achirkin Oct 19, 2022
9cc1a92
Fix invalid scores being added when number of samples % 32 != 0
achirkin Oct 19, 2022
507158e
Fix incorrect copying of pq_centers in PER_CLUSTER case
achirkin Oct 19, 2022
e75e77a
Filter topk results to minimize the number of warp-sorts
achirkin Oct 20, 2022
662124a
Implemented warp_sort_distributed
achirkin Oct 22, 2022
17a75b2
Add missing 'ballot' wrapper
achirkin Oct 23, 2022
a30ad2b
Load pq_centers coalesced, but not vectorized
achirkin Oct 24, 2022
910ae33
Changed layout of the pq data
achirkin Oct 27, 2022
57ae845
Overhauled the launch configuration
achirkin Nov 3, 2022
0e4e631
Changed pq_dataset layout once again and promoted pq_bits to the temp…
achirkin Nov 9, 2022
c0e1c80
Manually unroll the compute-score loop with templates (+8%-+16% qps)
achirkin Nov 9, 2022
5e39679
Use OutT instead of float as the compute-score output type to save on…
achirkin Nov 9, 2022
7814339
set default shmem carveout to 1.0 to allow smaller block sizes
achirkin Nov 9, 2022
ad2ad59
Merge remote-tracking branch 'rapidsai/branch-22.12' into enh-ivf-pq-…
achirkin Nov 10, 2022
fe3399c
Fix build error due to merged code
achirkin Nov 10, 2022
acb1bf2
Add the early stop optimization for metrics that allow this (i.e. for…
achirkin Nov 10, 2022
5b51b3e
Address the first review comments
achirkin Nov 10, 2022
d8dcc43
Merge remote-tracking branch 'rapidsai/branch-22.12' into enh-ivf-pq-…
achirkin Nov 11, 2022
a78b52b
Address more review comments
achirkin Nov 11, 2022
f8a0568
Address one more set of comments
achirkin Nov 11, 2022
598a84e
Cosmetic improvements
achirkin Nov 14, 2022
b690d18
Simplify transpose_pq_centers code using mdspans
achirkin Nov 14, 2022
1cb1aa9
Fix incorrect composing of 'code' from the bits on boundaries between…
achirkin Nov 14, 2022
44ae87a
Allow ivf_pq::build inputs be on the host (for large datasets)
achirkin Nov 15, 2022
ead6bff
Update the documentation: ivf_pq::build accepts both host and device …
achirkin Nov 15, 2022
8665705
Merge remote-tracking branch 'rapidsai/branch-22.12' into enh-ivf-pq-…
achirkin Nov 16, 2022
d64799f
Fix an assert that is no longer valid
achirkin Nov 16, 2022
b28364d
Fix incorrect lookup of the DB record when the query result index is …
achirkin Nov 17, 2022
56d6d5b
Add extra checks for index invariants
achirkin Nov 17, 2022
10 changes: 5 additions & 5 deletions cpp/include/raft/neighbors/ivf_pq.cuh
@@ -53,7 +53,7 @@ namespace raft::neighbors::ivf_pq {
*
* @param handle
* @param params configure the index building
- * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+ * @param[in] dataset a device/host pointer to a row-major matrix [n_rows, dim]
* @param n_rows the number of samples
* @param dim the dimensionality of the data
*
@@ -91,8 +91,8 @@ inline auto build(
*
* @param handle
* @param orig_index original index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
* If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
* here to imply a continuous range `[0...n_rows)`.
* @param n_rows the number of samples
@@ -118,8 +118,8 @@ inline auto extend(const handle_t& handle,
*
* @param handle
* @param[inout] index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
* If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
* here to imply a continuous range `[0...n_rows)`.
* @param n_rows the number of samples
91 changes: 69 additions & 22 deletions cpp/include/raft/neighbors/ivf_pq_types.hpp
@@ -108,17 +108,30 @@ struct search_params : ann::search_params {
*/
cudaDataType_t internal_distance_dtype = CUDA_R_32F;
/**
-   * Thread block size of the distance calculation kernel at search time.
-   * When zero, an optimal block size is selected using a heuristic.
+   * Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
    *
-   * Possible values: [0, 256, 512, 1024]
+   * Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
+   *
+   * One wants to increase the carveout to ensure good GPU occupancy for the main search
+   * kernel, but not to keep it so high that no memory is left to be used as L1 cache. Note,
+   * this value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set
+   * of cache configurations, so the provided value is rounded up to the nearest configuration.
+   * Refer to the NVIDIA tuning guide for the target GPU architecture.
+   *
+   * Note, this is a low-level tuning parameter that can have drastic negative effects on the
+   * search performance if tweaked incorrectly.
    */
-  uint32_t preferred_thread_block_size = 0;
+  double preferred_shmem_carveout = 1.0;
};
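For orientation, a carveout fraction of this kind typically maps onto CUDA's per-function attribute `cudaFuncAttributePreferredSharedMemoryCarveout`, which takes an integer percentage hint. Below is a minimal sketch of that mapping, assuming a placeholder kernel `my_kernel`; the snippet is illustrative and not part of this diff.

```cuda
#include <cuda_runtime.h>

__global__ void my_kernel() {}  // placeholder for the ivf-pq search kernel

// Convert a [0.0 - 1.0] carveout fraction into the integer percentage hint
// understood by the driver; the driver rounds it to a supported L1/shared
// memory configuration.
cudaError_t apply_carveout(double preferred_shmem_carveout)
{
  int percent = static_cast<int>(preferred_shmem_carveout * 100.0);
  return cudaFuncSetAttribute(
    my_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, percent);
}
```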

static_assert(std::is_aggregate_v<index_params>);
static_assert(std::is_aggregate_v<search_params>);

+/** Size of the interleaved group. */
+constexpr static uint32_t kIndexGroupSize = 32;
+/** Stride of the interleaved group for vectorized loads. */
+constexpr static uint32_t kIndexGroupVecLen = 16;

/**
* @brief IVF-PQ index.
*
@@ -247,12 +260,12 @@ struct index : ann::index {
pq_dim_(pq_dim == 0 ? calculate_pq_dim(dim) : pq_dim),
n_nonempty_lists_(n_nonempty_lists),
pq_centers_{make_device_mdarray<float>(handle, make_pq_centers_extents())},
-      pq_dataset_{make_device_mdarray<uint8_t>(
-        handle, make_extents<IdxT>(0, this->pq_dim() * this->pq_bits() / 8))},
+      pq_dataset_{make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(0))},
indices_{make_device_mdarray<IdxT>(handle, make_extents<IdxT>(0))},
rotation_matrix_{
make_device_mdarray<float>(handle, make_extents<uint32_t>(this->rot_dim(), this->dim()))},
list_offsets_{make_device_mdarray<IdxT>(handle, make_extents<uint32_t>(this->n_lists() + 1))},
+      list_sizes_{make_device_mdarray<uint32_t>(handle, make_extents<uint32_t>(this->n_lists()))},
centers_{make_device_mdarray<float>(
handle, make_extents<uint32_t>(this->n_lists(), this->dim_ext()))},
centers_rot_{make_device_mdarray<float>(
@@ -283,35 +296,45 @@ struct index : ann::index {
*/
void allocate(const handle_t& handle, IdxT index_size)
{
-    pq_dataset_ =
-      make_device_mdarray<uint8_t>(handle, make_extents<IdxT>(index_size, pq_dataset_.extent(1)));
-    indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
+    pq_dataset_ = make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(index_size));
+    indices_    = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
check_consistency();
}

+  using pq_centers_extents =
+    std::experimental::extents<uint32_t, dynamic_extent, dynamic_extent, dynamic_extent>;
/**
* PQ cluster centers
*
-   * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_book_size, pq_len]
-   * - codebook_gen::PER_CLUSTER: [n_lists, pq_book_size, pq_len]
+   * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
+   * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size]
*/
-  inline auto pq_centers() noexcept -> device_mdspan<float, extent_3d<uint32_t>, row_major>
+  inline auto pq_centers() noexcept -> device_mdspan<float, pq_centers_extents, row_major>
{
return pq_centers_.view();
}
[[nodiscard]] inline auto pq_centers() const noexcept
-    -> device_mdspan<const float, extent_3d<uint32_t>, row_major>
+    -> device_mdspan<const float, pq_centers_extents, row_major>
{
return pq_centers_.view();
}

-  /** PQ-encoded data [size, pq_dim * pq_bits / 8]. */
-  inline auto pq_dataset() noexcept -> device_mdspan<uint8_t, extent_2d<IdxT>, row_major>
+  using pq_dataset_extents = std::experimental::
+    extents<IdxT, dynamic_extent, dynamic_extent, kIndexGroupSize, kIndexGroupVecLen>;
+  /** PQ-encoded data stored in the interleaved format:
+   *
+   *    [ ceildiv(size, kIndexGroupSize)
+   *    , ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits)
+   *    , kIndexGroupSize
+   *    , kIndexGroupVecLen
+   *    ].
+   */
+  inline auto pq_dataset() noexcept -> device_mdspan<uint8_t, pq_dataset_extents, row_major>
{
return pq_dataset_.view();
}
[[nodiscard]] inline auto pq_dataset() const noexcept
-    -> device_mdspan<const uint8_t, extent_2d<IdxT>, row_major>
+    -> device_mdspan<const uint8_t, pq_dataset_extents, row_major>
{
return pq_dataset_.view();
}
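To make the four extents above concrete: records are packed in groups of kIndexGroupSize, and each kIndexGroupVecLen-byte vector holds one record's codes for one chunk of subspaces. The following sketch of the resulting flat byte offset is illustrative only (not part of raft's API); `n_chunks` stands for the second extent.

```cpp
#include <cstddef>
#include <cstdint>

constexpr uint32_t kIndexGroupSize   = 32;  // records per interleaved group
constexpr uint32_t kIndexGroupVecLen = 16;  // bytes per vectorized load

// Byte offset of the 16-byte vector holding record `row`'s codes for chunk
// index `chunk`, in the row-major layout
// [n_groups, n_chunks, kIndexGroupSize, kIndexGroupVecLen].
constexpr size_t chunk_byte_offset(size_t row, size_t chunk, size_t n_chunks)
{
  size_t group = row / kIndexGroupSize;  // first extent: which group
  size_t lane  = row % kIndexGroupSize;  // third extent: slot within the group
  return (((group * n_chunks) + chunk) * kIndexGroupSize + lane) * kIndexGroupVecLen;
}
// 32 consecutive rows of a group differ only in `lane`, so a warp reading the
// same chunk touches 32 adjacent 16-byte vectors: a fully coalesced load.
```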
@@ -352,6 +375,17 @@ struct index : ann::index {
return list_offsets_.view();
}

+  /** Sizes of the lists [n_lists]. */
+  inline auto list_sizes() noexcept -> device_mdspan<uint32_t, extent_1d<uint32_t>, row_major>
+  {
+    return list_sizes_.view();
+  }
+  [[nodiscard]] inline auto list_sizes() const noexcept
+    -> device_mdspan<const uint32_t, extent_1d<uint32_t>, row_major>
+  {
+    return list_sizes_.view();
+  }

/** Cluster centers corresponding to the lists in the original space [n_lists, dim_ext] */
inline auto centers() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
{
@@ -374,6 +408,18 @@ struct index : ann::index {
return centers_rot_.view();
}

+  /** A helper function to determine the extents of an array large enough to hold a given
+   * amount of data.
+   */
+  auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents
+  {
+    // how many elems of pq_dim fit into one kIndexGroupVecLen-byte chunk
+    auto pq_chunk = (kIndexGroupVecLen * 8u) / pq_bits();
+    return make_extents<IdxT>(raft::div_rounding_up_safe<IdxT>(n_rows, kIndexGroupSize),
+                              raft::div_rounding_up_safe<IdxT>(pq_dim(), pq_chunk),
+                              kIndexGroupSize,
+                              kIndexGroupVecLen);
+  }

private:
raft::distance::DistanceType metric_;
codebook_gen codebook_kind_;
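Running the same arithmetic with illustrative numbers (pq_bits = 8, pq_dim = 64, n_rows = 1000; these values are assumptions, not taken from the PR):

```cpp
// pq_chunk = (kIndexGroupVecLen * 8) / pq_bits = (16 * 8) / 8 = 16
// extents  = [ceildiv(1000, 32), ceildiv(64, 16), 32, 16] = [32, 4, 32, 16]
// total    = 32 * 4 * 32 * 16 = 65536 bytes
// i.e. 1000 records are padded to 32 groups x 32 slots = 1024 record slots,
// each slot holding 64 codes of 8 bits = 64 bytes.
```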
@@ -383,11 +429,12 @@ struct index : ann::index {
uint32_t pq_dim_;
uint32_t n_nonempty_lists_;

-  device_mdarray<float, extent_3d<uint32_t>, row_major> pq_centers_;
-  device_mdarray<uint8_t, extent_2d<IdxT>, row_major> pq_dataset_;
+  device_mdarray<float, pq_centers_extents, row_major> pq_centers_;
+  device_mdarray<uint8_t, pq_dataset_extents, row_major> pq_dataset_;
device_mdarray<IdxT, extent_1d<IdxT>, row_major> indices_;
device_mdarray<float, extent_2d<uint32_t>, row_major> rotation_matrix_;
device_mdarray<IdxT, extent_1d<uint32_t>, row_major> list_offsets_;
+  device_mdarray<uint32_t, extent_1d<uint32_t>, row_major> list_sizes_;
device_mdarray<float, extent_2d<uint32_t>, row_major> centers_;
device_mdarray<float, extent_2d<uint32_t>, row_major> centers_rot_;

@@ -404,13 +451,13 @@ struct index : ann::index {
pq_bits() * pq_dim());
}

-  auto make_pq_centers_extents() -> extent_3d<uint32_t>
+  auto make_pq_centers_extents() -> pq_centers_extents
{
switch (codebook_kind()) {
case codebook_gen::PER_SUBSPACE:
-        return make_extents<uint32_t>(pq_dim(), pq_book_size(), pq_len());
+        return make_extents<uint32_t>(pq_dim(), pq_len(), pq_book_size());
case codebook_gen::PER_CLUSTER:
-        return make_extents<uint32_t>(n_lists(), pq_book_size(), pq_len());
+        return make_extents<uint32_t>(n_lists(), pq_len(), pq_book_size());
default: RAFT_FAIL("Unreachable code");
}
}
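The practical effect of swapping the last two extents can be read off the row-major offsets. The sketch below is illustrative only (not raft code), with `s` the subspace (or cluster), `l` the position within pq_len, and `c` the code:

```cpp
#include <cstddef>

// Old layout [.., pq_book_size, pq_len]: codes for a fixed l are pq_len apart.
constexpr size_t offset_old(size_t s, size_t c, size_t l, size_t book, size_t len)
{
  return (s * book + c) * len + l;
}

// New layout [.., pq_len, pq_book_size]: codes for a fixed l are contiguous,
// so scanning the whole codebook at one dimension is a unit-stride read.
constexpr size_t offset_new(size_t s, size_t l, size_t c, size_t len, size_t book)
{
  return (s * len + l) * book + c;
}
```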
@@ -420,7 +467,7 @@ struct index : ann::index {
// If the dimensionality is large enough, we can reduce it to improve performance
if (dim >= 128) { dim /= 2; }
// Round it down to 32 to improve performance.
-    uint32_t r = raft::round_down_safe<uint32_t>(dim, 32);
+    auto r = raft::round_down_safe<uint32_t>(dim, 32);
if (r > 0) return r;
// If the dimensionality is really low, round it to the closest power-of-two
r = 1;
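A short trace of the visible part of this heuristic (sample inputs assumed; the power-of-two branch is cut off by the diff, so its result below is inferred from the comment):

```cpp
// dim = 256: halved to 128 -> round_down_safe(128, 32) = 128 -> pq_dim = 128
// dim =  96: not halved    -> round_down_safe( 96, 32) =  96 -> pq_dim =  96
// dim =  20: not halved    -> round_down_safe( 20, 32) =   0
//            -> power-of-two fallback starting at r = 1 -> 16 (inferred)
```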
57 changes: 55 additions & 2 deletions cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -54,8 +54,9 @@ struct pointer_residency_count<Type, Types...> {
cudaPointerAttributes attr;
RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr));
switch (attr.type) {
-      case cudaMemoryTypeUnregistered:
-      case cudaMemoryTypeHost: return std::make_tuple(on_device, on_host + 1);
+      case cudaMemoryTypeUnregistered: return std::make_tuple(on_device, on_host + 1);
+      case cudaMemoryTypeHost:
+        return std::make_tuple(on_device + int(attr.devicePointer == ptr), on_host + 1);
case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host);
case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1);
default: return std::make_tuple(on_device, on_host);
@@ -75,6 +76,58 @@ auto check_pointer_residency(const Types*... ptrs) -> pointer_residency
return pointer_residency::mixed;
}

+/** RAII helper to access host data from the GPU when necessary. */
+template <typename PtrT, typename Action>
+struct with_mapped_memory_t {
+  with_mapped_memory_t(PtrT ptr, size_t size, Action action) : action_(action)
+  {
+    if (ptr == nullptr) { return; }
+    switch (utils::check_pointer_residency(ptr)) {
+      case utils::pointer_residency::device_only:
+      case utils::pointer_residency::host_and_device: {
+        dev_ptr_ = (void*)ptr;  // NOLINT
+      } break;
+      default: {
+        host_ptr_ = (void*)ptr;  // NOLINT
+        RAFT_CUDA_TRY(cudaHostRegister(host_ptr_, size, choose_flags(ptr)));
+        RAFT_CUDA_TRY(cudaHostGetDevicePointer(&dev_ptr_, host_ptr_, 0));
+      } break;
+    }
+  }
+
+  ~with_mapped_memory_t()
+  {
+    if (host_ptr_ != nullptr) { cudaHostUnregister(host_ptr_); }
+  }
+
+  auto operator()() { return action_((PtrT)dev_ptr_); }  // NOLINT
+
+ private:
+  Action action_;
+  void* host_ptr_ = nullptr;
+  void* dev_ptr_  = nullptr;
+
+  template <typename T>
+  static auto choose_flags(const T*) -> unsigned int
+  {
+    int dev_id, readonly_supported;
+    RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+    RAFT_CUDA_TRY(cudaDeviceGetAttribute(
+      &readonly_supported, cudaDevAttrHostRegisterReadOnlySupported, dev_id));
+    if (readonly_supported) {
+      return cudaHostRegisterMapped | cudaHostRegisterReadOnly;
+    } else {
+      return cudaHostRegisterMapped;
+    }
+  }
+
+  template <typename T>
+  static auto choose_flags(T*) -> unsigned int
+  {
+    return cudaHostRegisterMapped;
+  }
+};
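A minimal usage sketch of this helper (hypothetical call site; `run_on_device` is a stand-in for any consumer of a device-accessible pointer and is not part of this diff):

```cpp
#include <cstddef>
#include <vector>

void run_on_device(float* dev_ptr, size_t n_rows, size_t dim);  // hypothetical

void process_host_dataset(size_t n_rows, size_t dim)
{
  std::vector<float> host_data(n_rows * dim);
  // ... fill host_data ...

  // Registers the host range as mapped pinned memory (unless it is already
  // device-accessible) and hands a device-usable pointer to the action.
  utils::with_mapped_memory_t map(
    host_data.data(), host_data.size() * sizeof(float), [&](float* dev_ptr) {
      run_on_device(dev_ptr, n_rows, dim);
    });
  map();  // run the action; the destructor unregisters the memory afterwards
}
```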

template <typename T>
struct config {
};