Skip to content

Commit

Permalink
Scaling workspace resources (#181)
Browse files Browse the repository at this point in the history
Use raft's large workspace resource for large temporary allocations during ANN index build.
This is the port of rapidsai/raft#2194, which didn't make it into raft before the algorithms were ported to cuVS.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: #181
  • Loading branch information
achirkin authored Jun 12, 2024
1 parent 4847b5b commit 5668ef0
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 85 deletions.
59 changes: 41 additions & 18 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

// TODO: Fixme- this needs to be migrated
#include "../../ivf_pq/ivf_pq_build.cuh"
#include "../../ivf_pq/ivf_pq_search.cuh"
#include "../../nn_descent.cuh"

// TODO: This shouldn't be calling spatial/knn APIs
Expand Down Expand Up @@ -162,42 +163,64 @@ void build_knn_graph(
// search top (k + 1) neighbors
//

const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);
const auto max_batch_size = 1024;
const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);

// Use the same maximum batch size as the ivf_pq::search to avoid allocating more than needed.
using cuvs::neighbors::ivf_pq::detail::kMaxQueries;
// Heuristic: the build_knn_graph code should use only a fraction of the workspace memory; the
// rest should be used by the ivf_pq::search. Here we say that the workspace size should be a good
// multiple of what is required for the I/O batching below.
constexpr size_t kMinWorkspaceRatio = 5;
auto desired_workspace_size = kMaxQueries * kMinWorkspaceRatio *
(sizeof(DataT) * dataset.extent(1) // queries (dataset batch)
+ sizeof(float) * gpu_top_k // distances
+ sizeof(int64_t) * gpu_top_k // neighbors
+ sizeof(float) * top_k // refined_distances
+ sizeof(int64_t) * top_k // refined_neighbors
);

// If the workspace is smaller than desired, put the I/O buffers into the large workspace.
rmm::device_async_resource_ref workspace_mr =
desired_workspace_size <= raft::resource::get_workspace_free_bytes(res)
? raft::resource::get_workspace_resource(res)
: raft::resource::get_large_workspace_resource(res);

RAFT_LOG_DEBUG(
"IVF-PQ search node_degree: %d, top_k: %d, gpu_top_k: %d, max_batch_size:: %d, n_probes: %u",
node_degree,
top_k,
gpu_top_k,
max_batch_size,
kMaxQueries,
pq.search_params.n_probes);

auto distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, gpu_top_k);
auto neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, gpu_top_k);
auto refined_distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, top_k);
auto refined_neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, top_k);
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(max_batch_size, dataset.extent(1));
auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, top_k);
auto refined_distances_host = raft::make_host_matrix<float, int64_t>(max_batch_size, top_k);
auto distances = raft::make_device_mdarray<float>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
auto neighbors = raft::make_device_mdarray<int64_t>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
auto refined_distances = raft::make_device_mdarray<float>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, top_k));
auto refined_neighbors = raft::make_device_mdarray<int64_t>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, top_k));
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(kMaxQueries, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(kMaxQueries, dataset.extent(1));
auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(kMaxQueries, top_k);
auto refined_distances_host = raft::make_host_matrix<float, int64_t>(kMaxQueries, top_k);

// TODO(tfeher): batched search with multiple GPUs
std::size_t num_self_included = 0;
bool first = true;
const auto start_clock = std::chrono::system_clock::now();

rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res);

cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT> vec_batches(
dataset.data_handle(),
dataset.extent(0),
dataset.extent(1),
(int64_t)max_batch_size,
static_cast<int64_t>(kMaxQueries),
raft::resource::get_cuda_stream(res),
device_memory);
workspace_mr);

size_t next_report_offset = 0;
size_t d_report_offset = dataset.extent(0) / 100; // Report progress in 1% steps.
Expand Down
30 changes: 18 additions & 12 deletions cpp/src/neighbors/detail/cagra/graph_core.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@
#include <memory>
#include <random>

namespace cuvs::neighbors::cagra::detail {
namespace graph {
namespace cuvs::neighbors::cagra::detail::graph {

// unnamed namespace to avoid multiple definition error
namespace {
Expand Down Expand Up @@ -251,15 +250,19 @@ void sort_knn_graph(
const uint32_t input_graph_degree = knn_graph.extent(1);
IdxT* const input_graph_ptr = knn_graph.data_handle();

auto d_input_graph = raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

auto d_input_graph = raft::make_device_mdarray<IdxT>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

//
// Sorting kNN graph
//
const double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");

auto d_dataset = raft::make_device_matrix<DataT, int64_t>(res, dataset_size, dataset_dim);
auto d_dataset = raft::make_device_mdarray<DataT>(
res, large_tmp_mr, raft::make_extents<int64_t>(dataset_size, dataset_dim));
raft::copy(d_dataset.data_handle(),
dataset_ptr,
dataset_size * dataset_dim,
Expand Down Expand Up @@ -332,6 +335,7 @@ void optimize(
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
Expand All @@ -347,15 +351,16 @@ void optimize(
//
// Prune kNN graph
//
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
Expand Down Expand Up @@ -475,14 +480,16 @@ void optimize(
graph_size * output_graph_degree * sizeof(IdxT),
raft::resource::get_cuda_stream(res)));

auto d_rev_graph_count = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes = raft::make_device_vector<IdxT, int64_t>(res, graph_size);
auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes =
raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));

for (uint64_t k = 0; k < output_graph_degree; k++) {
#pragma omp parallel for
Expand Down Expand Up @@ -578,5 +585,4 @@ void optimize(
}
}

} // namespace graph
} // namespace cuvs::neighbors::cagra::detail
} // namespace cuvs::neighbors::cagra::detail::graph
7 changes: 5 additions & 2 deletions cpp/src/neighbors/detail/cagra/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,11 @@ class device_matrix_view_from_host {
device_ptr = reinterpret_cast<T*>(attr.devicePointer);
if (device_ptr == NULL) {
// allocate memory and copy over
device_mem_.emplace(
raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
// NB: We use the temporary "large" workspace resource here; this structure is supposed to
// live on stack and not returned to a user.
// The user may opt to set this resource to managed memory to allow large allocations.
device_mem_.emplace(raft::make_device_mdarray<T, IdxT>(
res, raft::resource::get_large_workspace_resource(res), host_view.extents()));
raft::copy(device_mem_->data_handle(),
host_view.data_handle(),
host_view.extent(0) * host_view.extent(1),
Expand Down
13 changes: 9 additions & 4 deletions cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <raft/core/mdarray.hpp>
#include <raft/core/operators.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/add.cuh>
#include <raft/linalg/map.cuh>
Expand Down Expand Up @@ -184,7 +185,8 @@ void extend(raft::resources const& handle,
RAFT_EXPECTS(new_indices != nullptr || index->size() == 0,
"You must pass data indices when the index is non-empty.");

auto new_labels = raft::make_device_vector<LabelT, IdxT>(handle, n_rows);
auto new_labels = raft::make_device_mdarray<LabelT>(
handle, raft::resource::get_large_workspace_resource(handle), raft::make_extents<IdxT>(n_rows));
cuvs::cluster::kmeans::balanced_params kmeans_params;
kmeans_params.metric = index->metric();
auto orig_centroids_view =
Expand Down Expand Up @@ -215,7 +217,8 @@ void extend(raft::resources const& handle,
}

auto* list_sizes_ptr = index->list_sizes().data_handle();
auto old_list_sizes_dev = raft::make_device_vector<uint32_t, IdxT>(handle, n_lists);
auto old_list_sizes_dev = raft::make_device_mdarray<uint32_t>(
handle, raft::resource::get_workspace_resource(handle), raft::make_extents<IdxT>(n_lists));
raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream);

// Calculate the centers and sizes on the new data, starting from the original values
Expand Down Expand Up @@ -371,7 +374,8 @@ inline auto build(raft::resources const& handle,
auto trainset_ratio = std::max<size_t>(
1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
auto n_rows_train = n_rows / trainset_ratio;
rmm::device_uvector<T> trainset(n_rows_train * index.dim(), stream);
rmm::device_uvector<T> trainset(
n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle));
// TODO: a proper sampling
RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
sizeof(T) * index.dim(),
Expand Down Expand Up @@ -431,7 +435,8 @@ inline void fill_refinement_index(raft::resources const& handle,
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries));

rmm::device_uvector<LabelT> new_labels(n_queries * n_candidates, stream);
rmm::device_uvector<LabelT> new_labels(
n_queries * n_candidates, stream, raft::resource::get_workspace_resource(handle));
auto new_labels_view =
raft::make_device_vector_view<LabelT, IdxT>(new_labels.data(), n_queries * n_candidates);
raft::linalg::map_offset(
Expand Down
23 changes: 10 additions & 13 deletions cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -280,17 +280,15 @@ void search_impl(raft::resources const& handle,
template <typename T,
typename IdxT,
typename IvfSampleFilterT = cuvs::neighbors::filtering::none_ivf_sample_filter>
inline void search_with_filtering(
raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(),
IvfSampleFilterT sample_filter = IvfSampleFilterT())
inline void search_with_filtering(raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
IvfSampleFilterT sample_filter = IvfSampleFilterT())
{
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
Expand Down Expand Up @@ -335,7 +333,7 @@ inline void search_with_filtering(
cuvs::distance::is_min_close(index.metric()),
neighbors + offset_q * k,
distances + offset_q * k,
mr,
raft::resource::get_workspace_resource(handle),
sample_filter);
}
}
Expand Down Expand Up @@ -367,7 +365,6 @@ void search_with_filtering(raft::resources const& handle,
static_cast<std::uint32_t>(neighbors.extent(1)),
neighbors.data_handle(),
distances.data_handle(),
raft::resource::get_workspace_resource(handle),
sample_filter);
}

Expand Down
Loading

0 comments on commit 5668ef0

Please sign in to comment.