CAGRA: Separate graph index sorting functionality from prune function (#1471)

# Changes

This PR separates the graph index sorting functionality from the CAGRA `prune` function and moves it into a new `sort_knn_graph` function. (Related issue: #1446)
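
A minimal sketch of the resulting call sequence (illustration only, not code from this PR; the wrapper name `sort_then_prune`, the `float`/`uint32_t` types, and the host-resident graph views are assumptions): a KNN graph that was not produced by `cagra::build_knn_graph` is sorted first, and `prune` then operates on the sorted graph without needing the dataset.

```cpp
#include <cstdint>

#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/neighbors/cagra.cuh>

// Sketch only: sort a graph built outside `cagra::build_knn_graph`, then prune it.
// After this PR, `prune` no longer takes the dataset argument.
void sort_then_prune(raft::device_resources const& res,
                     raft::device_matrix_view<const float, uint32_t> dataset,
                     raft::host_matrix_view<uint32_t, uint32_t> knn_graph,
                     raft::host_matrix_view<uint32_t, uint32_t> pruned_graph)
{
  namespace cagra = raft::neighbors::experimental::cagra;
  // New function introduced by this PR: sort each graph row by the neighbors'
  // distance to the row's source vector.
  cagra::sort_knn_graph(res, dataset, knn_graph);
  // Pruning reads only the (sorted) graph and writes the reduced-degree graph.
  cagra::prune(res, knn_graph, pruned_graph);
}
```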

# Unit test

I have added a new unit test for the sorting function. The test uses a dataset separate from the one used in the main CAGRA test, to avoid the effect of rounding errors in the norm computation between two vectors in the dataset. More details are in the source code:
https://github.com/enp1s0/raft/blob/ea6c449c260895e9125a591a4848eed06f5b72c4/cpp/test/neighbors/ann_cagra.cuh#L93-L96
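
For illustration only (this is not the test code from the PR), a hypothetical host-side reference check of the property the sort is expected to establish, assuming squared Euclidean distance as the ordering key: each graph row should list its neighbors in non-decreasing distance to the row's source vector. When two neighbors are nearly equidistant, rounding in the norm computation can make that ordering ambiguous, which is why the test dataset is chosen to avoid such cases.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical reference check (not the actual test): returns true if every
// graph row lists neighbors in non-decreasing squared Euclidean distance to
// the row's source vector.
bool is_knn_graph_sorted(const std::vector<float>& dataset,   // [n_rows * dim], row-major
                         const std::vector<uint32_t>& graph,  // [n_rows * degree], row-major
                         std::size_t n_rows,
                         std::size_t dim,
                         std::size_t degree)
{
  auto sq_dist = [&](std::size_t a, std::size_t b) {
    double d = 0.0;
    for (std::size_t k = 0; k < dim; k++) {
      const double diff =
        static_cast<double>(dataset[a * dim + k]) - static_cast<double>(dataset[b * dim + k]);
      d += diff * diff;
    }
    return d;
  };
  for (std::size_t row = 0; row < n_rows; row++) {
    for (std::size_t j = 1; j < degree; j++) {
      const double prev = sq_dist(row, graph[row * degree + j - 1]);
      const double curr = sq_dist(row, graph[row * degree + j]);
      if (curr < prev) { return false; }  // out of order
    }
  }
  return true;
}
```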

# Issue
Close #1446

Authors:
  - tsuki (https://github.com/enp1s0)
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: #1471
enp1s0 authored May 10, 2023
1 parent 2014112 commit cc4a76b
Showing 6 changed files with 306 additions and 70 deletions.
65 changes: 51 additions & 14 deletions cpp/include/raft/neighbors/cagra.cuh
@@ -52,8 +52,8 @@ namespace raft::neighbors::experimental::cagra {
* @code{.cpp}
* using namespace raft::neighbors;
* // use default index parameters
* ivf_pq::index_params build_params;
* ivf_pq::search_params search_params
* cagra::index_params build_params;
* cagra::search_params search_params
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // create knn graph
* cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
@@ -84,34 +84,71 @@ void build_knn_graph(raft::device_resources const& res,
detail::build_knn_graph(res, dataset, knn_graph, refine_rate, build_params, search_params);
}

/**
* @brief Sort a KNN graph index.
* Preprocessing step for `cagra::prune`: If a KNN graph is not built using
* `cagra::build_knn_graph`, then it is necessary to call this function before calling
* `cagra::prune`. If the graph is built by `cagra::build_knn_graph`, it is already sorted and you
* do not need to call this function.
*
* Usage example:
* @code{.cpp}
* using namespace raft::neighbors;
* cagra::index_params build_params;
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // build KNN graph not using `cagra::build_knn_graph`
* // build(knn_graph, dataset, ...);
* // sort graph index
* sort_knn_graph(res, dataset.view(), knn_graph.view());
* // prune graph
* cagra::prune(res, dataset, knn_graph.view(), pruned_graph.view());
* // Construct an index from dataset and pruned knn_graph
* auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset, pruned_graph.view());
* @endcode
*
* @tparam DataT type of the data in the source dataset
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in,out] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
*/
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
detail::graph::sort_knn_graph(res, dataset, knn_graph);
}

/**
* @brief Prune a KNN graph.
*
* Decrease the number of neighbors for each node.
*
* See [cagra::build_knn_graph](#cagra::build_knn_graph) for usage example
*
* @tparam T data element type
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
* @param[out] new_graph a host matrix view of the pruned knn graph [n_rows, graph_degree]
*/
template <class DATA_T,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
detail::graph::prune(res, dataset, knn_graph, new_graph);
detail::graph::prune(res, knn_graph, new_graph);
}

/**
@@ -138,11 +175,11 @@ void prune(raft::device_resources const& res,
* // create and fill the index from a [N, D] dataset
* auto index = cagra::build(res, index_params, dataset);
* // use default search parameters
* ivf_pq::search_params search_params;
* cagra::search_params search_params;
* // search K nearest neighbours
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<float>(res, n_queries, k);
* ivf_pq::search(res, search_params, index, queries, neighbors, distances);
* cagra::search(res, search_params, index, queries, neighbors, distances);
* @endcode
*
* @tparam T data element type
@@ -178,7 +215,7 @@ index<T, IdxT> build(raft::device_resources const& res,

auto cagra_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), params.graph_degree);

prune<T, IdxT>(res, dataset, knn_graph.view(), cagra_graph.view());
prune<IdxT>(res, knn_graph.view(), cagra_graph.view());

// Construct an index from dataset and pruned knn graph.
return index<T, IdxT>(res, params.metric, dataset, cagra_graph.view());
131 changes: 84 additions & 47 deletions cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -405,36 +405,24 @@ void shift_array(T* array, uint64_t num)
}
}

/** Input arrays can be both host and device*/
template <class DATA_T,
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
"dataset size is expected to have the same number of graph index size");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const DataT* dataset_ptr = dataset.data_handle();

RAFT_EXPECTS(
dataset.extent(0) == knn_graph.extent(0) && knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
const DATA_T* dataset_ptr = dataset.data_handle();
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
float scale = 1.0f / raft::spatial::knn::detail::utils::config<DATA_T>::kDivisor;
const std::size_t graph_size = dataset_size;
size_t array_size;
const uint32_t input_graph_degree = knn_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();

// Setup GPUs
int num_gpus = 0;
@@ -451,46 +439,48 @@ void prune(raft::device_resources const& res,
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
const uint32_t graph_size = knn_graph.extent(0);
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
const uint32_t graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

uint32_t dataset_chunk_size = dataset_size;
DATA_T*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
DataT*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
const uint32_t dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
assert(dataset_chunk_size == graph_chunk_size);
d_dataset_ptr = mgpu_alloc<DATA_T>(num_gpus, dataset_chunk_size, dataset_dim);
d_dataset_ptr = mgpu_alloc<DataT>(num_gpus, dataset_chunk_size, dataset_dim);

mgpu_H2D<DATA_T>(
const float scale = 1.0f / raft::spatial::knn::detail::utils::config<DataT>::kDivisor;

mgpu_H2D<DataT>(
d_dataset_ptr, dataset_ptr, num_gpus, dataset_size, dataset_chunk_size, dataset_dim);

//
// Sorting kNN graph
//
double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");
mgpu_H2D<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_H2D<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
void (*kernel_sort)(
DATA_T**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
DataT**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
constexpr int numElementsPerThread = 4;
dim3 threads_sort(1, 1, 1);
if (input_graph_degree <= numElementsPerThread * 32) {
constexpr int blockDim_x = 32;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 64) {
constexpr int blockDim_x = 64;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 128) {
constexpr int blockDim_x = 128;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 256) {
constexpr int blockDim_x = 256;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else {
fprintf(stderr,
@@ -510,21 +500,68 @@ void prune(raft::device_resources const& res,
dataset_dim,
scale,
d_input_graph_ptr[i_gpu],
graph_size,
dataset_size,
graph_chunk_size,
input_graph_degree,
i_gpu);
}
RAFT_CUDA_TRY(cudaSetDevice(0));
RAFT_CUDA_TRY(cudaDeviceSynchronize());
RAFT_LOG_DEBUG(".");
mgpu_D2H<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_D2H<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
RAFT_LOG_DEBUG("\n");
double time_sort_end = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN graph time: %.1lf sec\n", time_sort_end - time_sort_start);

mgpu_free<DATA_T>(d_dataset_ptr, num_gpus);
mgpu_free<DataT>(d_dataset_ptr, num_gpus);
}

/** Input arrays can be both host and device*/
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
const std::size_t graph_size = new_graph.extent(0);
size_t array_size;

// Setup GPUs
int num_gpus = 0;

// Setup GPUs
RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus));
RAFT_LOG_DEBUG("# num_gpus: %d\n", num_gpus);
for (int self = 0; self < num_gpus; self++) {
RAFT_CUDA_TRY(cudaSetDevice(self));
for (int peer = 0; peer < num_gpus; peer++) {
if (self == peer) { continue; }
RAFT_CUDA_TRY(cudaDeviceEnablePeerAccess(peer, 0));
}
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

//
uint8_t* detour_count; // [graph_size, input_graph_degree]
