Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scaling workspace resources #181

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 32 additions & 14 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,26 @@ void build_knn_graph(
// search top (k + 1) neighbors
//

const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);
const auto max_batch_size = 1024;
const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);
rmm::device_async_resource_ref workspace_mr = raft::resource::get_workspace_resource(res);

constexpr size_t kDefaultBatchSize = 1024;
constexpr size_t kMaxBatchSize = 4096; // No more perf beyond this
constexpr size_t kMinBatchSize = 128; // Too slow if smaller
// Heuristic: how much of the workspace we can spare for the queries.
// The rest is going to be used by ivf_pq::search
const auto workspace_queries_bytes = raft::resource::get_workspace_free_bytes(res) / 5;
auto max_batch_size =
std::min<size_t>(workspace_queries_bytes / sizeof(DataT) / dataset.extent(1), kMaxBatchSize);
// Heuristic: if the workspace is too small for a decent batch size, switch to use the large
// resource with a default batch size.
if (max_batch_size < kMinBatchSize) {
tfeher marked this conversation as resolved.
Show resolved Hide resolved
max_batch_size = kDefaultBatchSize;
workspace_mr = raft::resource::get_large_workspace_resource(res);
}
tfeher marked this conversation as resolved.
Show resolved Hide resolved
RAFT_LOG_DEBUG(
"IVF-PQ search node_degree: %d, top_k: %d, gpu_top_k: %d, max_batch_size:: %d, n_probes: %u",
node_degree,
Expand All @@ -175,12 +190,17 @@ void build_knn_graph(
max_batch_size,
pq.search_params.n_probes);

auto distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, gpu_top_k);
auto neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, gpu_top_k);
auto refined_distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, top_k);
auto refined_neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, top_k);
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(max_batch_size, dataset.extent(1));
rmm::device_async_resource_ref large_mr = raft::resource::get_large_workspace_resource(res);
auto distances = raft::make_device_mdarray<float>(
res, large_mr, raft::make_extents<int64_t>(max_batch_size, gpu_top_k));
auto neighbors = raft::make_device_mdarray<int64_t>(
res, large_mr, raft::make_extents<int64_t>(max_batch_size, gpu_top_k));
auto refined_distances = raft::make_device_mdarray<float>(
res, large_mr, raft::make_extents<int64_t>(max_batch_size, top_k));
auto refined_neighbors = raft::make_device_mdarray<int64_t>(
res, large_mr, raft::make_extents<int64_t>(max_batch_size, top_k));
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(max_batch_size, dataset.extent(1));
auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, top_k);
auto refined_distances_host = raft::make_host_matrix<float, int64_t>(max_batch_size, top_k);

Expand All @@ -189,15 +209,13 @@ void build_knn_graph(
bool first = true;
const auto start_clock = std::chrono::system_clock::now();

rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res);

cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT> vec_batches(
dataset.data_handle(),
dataset.extent(0),
dataset.extent(1),
(int64_t)max_batch_size,
raft::resource::get_cuda_stream(res),
device_memory);
workspace_mr);

size_t next_report_offset = 0;
size_t d_report_offset = dataset.extent(0) / 100; // Report progress in 1% steps.
Expand Down
30 changes: 18 additions & 12 deletions cpp/src/neighbors/detail/cagra/graph_core.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@
#include <memory>
#include <random>

namespace cuvs::neighbors::cagra::detail {
namespace graph {
namespace cuvs::neighbors::cagra::detail::graph {

// unnamed namespace to avoid multiple definition error
namespace {
Expand Down Expand Up @@ -251,15 +250,19 @@ void sort_knn_graph(
const uint32_t input_graph_degree = knn_graph.extent(1);
IdxT* const input_graph_ptr = knn_graph.data_handle();

auto d_input_graph = raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

auto d_input_graph = raft::make_device_mdarray<IdxT>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

//
// Sorting kNN graph
//
const double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");

auto d_dataset = raft::make_device_matrix<DataT, int64_t>(res, dataset_size, dataset_dim);
auto d_dataset = raft::make_device_mdarray<DataT>(
res, large_tmp_mr, raft::make_extents<int64_t>(dataset_size, dataset_dim));
raft::copy(d_dataset.data_handle(),
dataset_ptr,
dataset_size * dataset_dim,
Expand Down Expand Up @@ -332,6 +335,7 @@ void optimize(
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
Expand All @@ -347,15 +351,16 @@ void optimize(
//
// Prune kNN graph
//
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
Expand Down Expand Up @@ -475,14 +480,16 @@ void optimize(
graph_size * output_graph_degree * sizeof(IdxT),
raft::resource::get_cuda_stream(res)));

auto d_rev_graph_count = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes = raft::make_device_vector<IdxT, int64_t>(res, graph_size);
auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes =
raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));

for (uint64_t k = 0; k < output_graph_degree; k++) {
#pragma omp parallel for
Expand Down Expand Up @@ -578,5 +585,4 @@ void optimize(
}
}

} // namespace graph
} // namespace cuvs::neighbors::cagra::detail
} // namespace cuvs::neighbors::cagra::detail::graph
7 changes: 5 additions & 2 deletions cpp/src/neighbors/detail/cagra/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,11 @@ class device_matrix_view_from_host {
device_ptr = reinterpret_cast<T*>(attr.devicePointer);
if (device_ptr == NULL) {
// allocate memory and copy over
device_mem_.emplace(
raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
// NB: We use the temporary "large" workspace resource here; this structure is supposed to
tfeher marked this conversation as resolved.
Show resolved Hide resolved
// live on stack and not returned to a user.
// The user may opt to set this resource to managed memory to allow large allocations.
device_mem_.emplace(raft::make_device_mdarray<T, IdxT>(
res, raft::resource::get_large_workspace_resource(res), host_view.extents()));
raft::copy(device_mem_->data_handle(),
host_view.data_handle(),
host_view.extent(0) * host_view.extent(1),
Expand Down
13 changes: 9 additions & 4 deletions cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <raft/core/mdarray.hpp>
#include <raft/core/operators.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/add.cuh>
#include <raft/linalg/map.cuh>
Expand Down Expand Up @@ -184,7 +185,8 @@ void extend(raft::resources const& handle,
RAFT_EXPECTS(new_indices != nullptr || index->size() == 0,
"You must pass data indices when the index is non-empty.");

auto new_labels = raft::make_device_vector<LabelT, IdxT>(handle, n_rows);
auto new_labels = raft::make_device_mdarray<LabelT>(
handle, raft::resource::get_large_workspace_resource(handle), raft::make_extents<IdxT>(n_rows));
cuvs::cluster::kmeans::balanced_params kmeans_params;
kmeans_params.metric = index->metric();
auto orig_centroids_view =
Expand Down Expand Up @@ -215,7 +217,8 @@ void extend(raft::resources const& handle,
}

auto* list_sizes_ptr = index->list_sizes().data_handle();
auto old_list_sizes_dev = raft::make_device_vector<uint32_t, IdxT>(handle, n_lists);
auto old_list_sizes_dev = raft::make_device_mdarray<uint32_t>(
handle, raft::resource::get_workspace_resource(handle), raft::make_extents<IdxT>(n_lists));
raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream);

// Calculate the centers and sizes on the new data, starting from the original values
Expand Down Expand Up @@ -371,7 +374,8 @@ inline auto build(raft::resources const& handle,
auto trainset_ratio = std::max<size_t>(
1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
auto n_rows_train = n_rows / trainset_ratio;
rmm::device_uvector<T> trainset(n_rows_train * index.dim(), stream);
rmm::device_uvector<T> trainset(
n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle));
// TODO: a proper sampling
RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
sizeof(T) * index.dim(),
Expand Down Expand Up @@ -431,7 +435,8 @@ inline void fill_refinement_index(raft::resources const& handle,
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries));

rmm::device_uvector<LabelT> new_labels(n_queries * n_candidates, stream);
rmm::device_uvector<LabelT> new_labels(
n_queries * n_candidates, stream, raft::resource::get_workspace_resource(handle));
auto new_labels_view =
raft::make_device_vector_view<LabelT, IdxT>(new_labels.data(), n_queries * n_candidates);
raft::linalg::map_offset(
Expand Down
23 changes: 10 additions & 13 deletions cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -280,17 +280,15 @@ void search_impl(raft::resources const& handle,
template <typename T,
typename IdxT,
typename IvfSampleFilterT = cuvs::neighbors::filtering::none_ivf_sample_filter>
inline void search_with_filtering(
raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(),
IvfSampleFilterT sample_filter = IvfSampleFilterT())
inline void search_with_filtering(raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
IvfSampleFilterT sample_filter = IvfSampleFilterT())
{
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
Expand Down Expand Up @@ -335,7 +333,7 @@ inline void search_with_filtering(
cuvs::distance::is_min_close(index.metric()),
neighbors + offset_q * k,
distances + offset_q * k,
mr,
raft::resource::get_workspace_resource(handle),
sample_filter);
}
}
Expand Down Expand Up @@ -367,7 +365,6 @@ void search_with_filtering(raft::resources const& handle,
static_cast<std::uint32_t>(neighbors.extent(1)),
neighbors.data_handle(),
distances.data_handle(),
raft::resource::get_workspace_resource(handle),
sample_filter);
}

Expand Down
Loading
Loading