diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 5a146c18fe..9e68e21a76 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 73030ea53f..62c9faa7d7 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -262,13 +262,11 @@ __launch_bounds__(BlockDim) __global__ template struct calc_chunk_indices { public: - using kernel_t = void (*)(uint32_t, const IdxT*, const uint32_t*, uint32_t*, uint32_t*); - struct configured { - kernel_t kernel; - uint32_t block_dim; + void* kernel; + dim3 block_dim; + dim3 grid_dim; uint32_t n_probes; - uint32_t n_queries; void operator()(const IdxT* cluster_offsets, const uint32_t* clusters_to_probe, @@ -276,8 +274,9 @@ struct calc_chunk_indices { uint32_t* n_samples, rmm::cuda_stream_view stream) { - kernel<<>>( - n_probes, cluster_offsets, clusters_to_probe, chunk_indices, n_samples); + void* args[] = // NOLINT + {&n_probes, &cluster_offsets, &clusters_to_probe, &chunk_indices, &n_samples}; + RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream)); } }; @@ -293,7 +292,10 @@ struct calc_chunk_indices { if constexpr (BlockDim >= WarpSize * 2) { if (BlockDim >= n_probes * 2) { return try_block_dim<(BlockDim / 2)>(n_probes, n_queries); } } - return {calc_chunk_indices_kernel, BlockDim, n_probes, n_queries}; + return {reinterpret_cast(calc_chunk_indices_kernel), + dim3(BlockDim, 1, 1), + dim3(n_queries, 1, 1), + n_probes}; } }; @@ -830,16 +832,17 @@ struct ivfpq_compute_similarity { }; struct selected { - kernel_t kernel; - uint32_t n_blocks; - uint32_t n_threads; + void* kernel; + dim3 grid_dim; + dim3 block_dim; size_t smem_size; size_t device_lut_size; template - void operator()(rmm::cuda_stream_view stream, Args&&... args) + void operator()(rmm::cuda_stream_view stream, Args... args) { - kernel<<>>(std::forward(args)...); + void* xs[] = {&args...}; // NOLINT + RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, xs, smem_size, stream)); } }; @@ -967,7 +970,11 @@ struct ivfpq_compute_similarity { } uint32_t device_lut_size = use_smem_lut ? 0u : n_blocks * (pq_dim << pq_bits); - return {kernel, n_blocks, n_threads, smem_size, device_lut_size}; + return {reinterpret_cast(kernel), + dim3(n_blocks, 1, 1), + dim3(n_threads, 1, 1), + smem_size, + device_lut_size}; } }; @@ -984,7 +991,6 @@ void ivfpq_search_worker(const handle_t& handle, const index& index, uint32_t max_samples, uint32_t n_probes, - uint32_t max_batch_size, uint32_t topK, uint32_t preferred_thread_block_size, uint32_t n_queries, @@ -994,10 +1000,6 @@ void ivfpq_search_worker(const handle_t& handle, float* distances, // [n_queries, topK] rmm::mr::device_memory_resource* mr) { - RAFT_EXPECTS(n_queries <= max_batch_size, - "number of queries (%u) must be smaller the max batch size (%u)", - n_queries, - max_batch_size); auto stream = handle.get_stream(); auto pq_centers = index.pq_centers().data_handle(); @@ -1006,10 +1008,10 @@ void ivfpq_search_worker(const handle_t& handle, auto cluster_centers = index.centers_rot().data_handle(); auto cluster_offsets = index.list_offsets().data_handle(); - bool manage_local_topk = - topK <= kMaxCapacity // depth is not too large - && n_probes >= 16 // not too few clusters looked up - && max_batch_size * n_probes >= 256 // overall amount of work is not too small + bool manage_local_topk = topK <= kMaxCapacity // depth is not too large + && n_probes >= 16 // not too few clusters looked up + && + n_queries * n_probes >= 256 // overall amount of work is not too small ; auto topk_len = manage_local_topk ? n_probes * topK : max_samples; if (manage_local_topk) { @@ -1021,14 +1023,14 @@ void ivfpq_search_worker(const handle_t& handle, rmm::device_uvector index_list_sorted_buf(0, stream, mr); uint32_t* index_list_sorted = nullptr; - rmm::device_uvector num_samples(max_batch_size, stream, mr); - rmm::device_uvector chunk_index(max_batch_size * n_probes, stream, mr); + rmm::device_uvector num_samples(n_queries, stream, mr); + rmm::device_uvector chunk_index(n_queries * n_probes, stream, mr); // [maxBatchSize, max_samples] or [maxBatchSize, n_probes, topk] - rmm::device_uvector distances_buf(max_batch_size * topk_len, stream, mr); + rmm::device_uvector distances_buf(n_queries * topk_len, stream, mr); rmm::device_uvector neighbors_buf(0, stream, mr); IdxT* neighbors_ptr = nullptr; if (manage_local_topk) { - neighbors_buf.resize(max_batch_size * topk_len, stream); + neighbors_buf.resize(n_queries * topk_len, stream); neighbors_ptr = neighbors_buf.data(); } @@ -1040,9 +1042,9 @@ void ivfpq_search_worker(const handle_t& handle, // The goal is to incrase the L2 cache hit rate to read the vectors // of a cluster by processing the cluster at the same time as much as // possible. - index_list_sorted_buf.resize(max_batch_size * n_probes, stream); - rmm::device_uvector index_list_buf(max_batch_size * n_probes, stream, mr); - rmm::device_uvector cluster_labels_out(max_batch_size * n_probes, stream, mr); + index_list_sorted_buf.resize(n_queries * n_probes, stream); + rmm::device_uvector index_list_buf(n_queries * n_probes, stream, mr); + rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); auto index_list = index_list_buf.data(); index_list_sorted = index_list_sorted_buf.data(); thrust::sequence(handle.get_thrust_policy(), @@ -1150,7 +1152,6 @@ struct ivfpq_search { uint32_t, uint32_t, uint32_t, - uint32_t, const uint32_t*, const float*, IdxT*, @@ -1319,7 +1320,7 @@ inline void search(const handle_t& handle, select_clusters(handle, clusters_to_probe.data(), float_queries.data(), - n_queries, + queries_batch, params.n_probes, index.n_lists(), dim, @@ -1358,7 +1359,6 @@ inline void search(const handle_t& handle, index, max_samples, params.n_probes, - max_batch_size, k, params.preferred_thread_block_size, batch_size, diff --git a/cpp/test/spatial/ann_ivf_pq.cuh b/cpp/test/spatial/ann_ivf_pq.cuh index 655c899f3c..a247f0101f 100644 --- a/cpp/test/spatial/ann_ivf_pq.cuh +++ b/cpp/test/spatial/ann_ivf_pq.cuh @@ -439,6 +439,34 @@ inline auto var_k() -> test_cases_t }); } +/** + * Cases brought up from downstream projects. + */ +inline auto special_cases() -> test_cases_t +{ + test_cases_t xs; + +#define ADD_CASE(f) \ + do { \ + xs.push_back({}); \ + ([](ivf_pq_inputs & x) f)(xs[xs.size() - 1]); \ + } while (0); + + ADD_CASE({ + x.num_db_vecs = 1183514; + x.dim = 100; + x.num_queries = 10000; + x.k = 10; + x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; + x.index_params.pq_dim = 10; + x.index_params.pq_bits = 8; + x.index_params.n_lists = 1024; + x.search_params.n_probes = 50; + }); + + return xs; +} + /* Test instantiations */ #define TEST_BUILD_SEARCH(type) \ diff --git a/cpp/test/spatial/ann_ivf_pq/test_float_uint32_t.cu b/cpp/test/spatial/ann_ivf_pq/test_float_uint32_t.cu index 81c7ef411a..cf2cf1ac54 100644 --- a/cpp/test/spatial/ann_ivf_pq/test_float_uint32_t.cu +++ b/cpp/test/spatial/ann_ivf_pq/test_float_uint32_t.cu @@ -21,6 +21,6 @@ namespace raft::spatial::knn { using f32_f32_u32 = ivf_pq_test; TEST_BUILD_SEARCH(f32_f32_u32) -INSTANTIATE(f32_f32_u32, defaults() + var_n_probes() + var_k()); +INSTANTIATE(f32_f32_u32, defaults() + var_n_probes() + var_k() + special_cases()); } // namespace raft::spatial::knn diff --git a/cpp/test/spatial/ann_utils.cuh b/cpp/test/spatial/ann_utils.cuh index ce92828e8d..7fb040c913 100644 --- a/cpp/test/spatial/ann_utils.cuh +++ b/cpp/test/spatial/ann_utils.cuh @@ -21,7 +21,9 @@ #include #include +#include #include +#include namespace raft::spatial::knn { @@ -99,13 +101,13 @@ inline auto operator<<(std::ostream& os, const print_metric& p) -> std::ostream& } template -__global__ void naiveDistanceKernel(EvalT* dist, - const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - raft::distance::DistanceType type) +__global__ void naive_distance_kernel(EvalT* dist, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + raft::distance::DistanceType type) { detail::utils::mapping f{}; IdxT midx = threadIdx.x + blockIdx.x * blockDim.x; @@ -146,23 +148,26 @@ void naiveBfKnn(EvalT* dist_topk, size_t dim, uint32_t k, raft::distance::DistanceType type, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { + rmm::mr::device_memory_resource* mr = nullptr; + auto pool_guard = raft::get_pool_memory_resource(mr, 1024 * 1024); + dim3 block_dim(16, 32, 1); // maximum reasonable grid size in `y` direction - uint16_t grid_y = + auto grid_y = static_cast(std::min(raft::ceildiv(input_len, block_dim.y), 32768)); // bound the memory used by this function size_t max_batch_size = std::min(n_inputs, raft::ceildiv(size_t(1) << size_t(27), input_len)); - rmm::device_uvector dist(max_batch_size * input_len, stream); + rmm::device_uvector dist(max_batch_size * input_len, stream, mr); for (size_t offset = 0; offset < n_inputs; offset += max_batch_size) { size_t batch_size = std::min(max_batch_size, n_inputs - offset); dim3 grid_dim(raft::ceildiv(batch_size, block_dim.x), grid_y, 1); - naiveDistanceKernel<<>>( + naive_distance_kernel<<>>( dist.data(), x + offset * dim, y, batch_size, input_len, dim, type); detail::select_topk(dist.data(), @@ -173,7 +178,8 @@ void naiveBfKnn(EvalT* dist_topk, dist_topk + offset * k, indices_topk + offset * k, type != raft::distance::DistanceType::InnerProduct, - stream); + stream, + mr); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -183,7 +189,7 @@ struct idx_dist_pair { IdxT idx; DistT dist; CompareDist eq_compare; - bool operator==(const idx_dist_pair& a) const + auto operator==(const idx_dist_pair& a) const -> bool { if (idx == a.idx) return true; if (eq_compare(dist, a.dist)) return true;