From 4738210c08117d67e2f3a739685ea523fc0fe2d6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 27 Jul 2023 17:22:28 +0200 Subject: [PATCH] IVF-PQ: Fix illegal memory access with large max_samples --- .../detail/ivf_pq_compute_similarity-ext.cuh | 5 +--- .../detail/ivf_pq_compute_similarity-inl.cuh | 17 +++++------ .../raft/neighbors/detail/ivf_pq_search.cuh | 29 +++++++++++-------- .../ivf_pq_compute_similarity_00_generate.py | 1 - .../ivf_pq_compute_similarity_float_float.cu | 1 - ...f_pq_compute_similarity_float_fp8_false.cu | 1 - ...vf_pq_compute_similarity_float_fp8_true.cu | 1 - .../ivf_pq_compute_similarity_float_half.cu | 1 - ...vf_pq_compute_similarity_half_fp8_false.cu | 1 - ...ivf_pq_compute_similarity_half_fp8_true.cu | 1 - .../ivf_pq_compute_similarity_half_half.cu | 1 - 11 files changed, 25 insertions(+), 34 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh index 0ae2e23b63..1a9788ce4c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh @@ -42,8 +42,7 @@ template -__global__ void compute_similarity_kernel(uint32_t n_rows, - uint32_t dim, +__global__ void compute_similarity_kernel(uint32_t dim, uint32_t n_probes, uint32_t pq_dim, uint32_t n_queries, @@ -82,7 +81,6 @@ struct selected { template void compute_similarity_run(selected s, rmm::cuda_stream_view stream, - uint32_t n_rows, uint32_t dim, uint32_t n_probes, uint32_t pq_dim, @@ -156,7 +154,6 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \ diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh index 2fefa900c3..90d993abd5 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh @@ -195,7 +195,6 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, * Setting this to `false` allows to reduce the shared memory usage (and maximum data dim) * at the cost of reducing global memory reading throughput. * - * @param n_rows the number of records in the dataset * @param dim the dimensionality of the data (NB: after rotation transform, i.e. `index.rot_dim()`). * @param n_probes the number of clusters to search for each query * @param pq_dim @@ -251,8 +250,7 @@ template -__global__ void compute_similarity_kernel(uint32_t n_rows, - uint32_t dim, +__global__ void compute_similarity_kernel(uint32_t dim, uint32_t n_probes, uint32_t pq_dim, uint32_t n_queries, @@ -327,14 +325,15 @@ __global__ void compute_similarity_kernel(uint32_t n_rows, uint32_t* out_indices = nullptr; if constexpr (kManageLocalTopK) { // Store topk calculated distances to out_scores (and its indices to out_indices) - out_scores = _out_scores + topk * (probe_ix + (n_probes * query_ix)); - out_indices = _out_indices + topk * (probe_ix + (n_probes * query_ix)); + const uint64_t out_offset = probe_ix + n_probes * query_ix; + out_scores = _out_scores + out_offset * topk; + out_indices = _out_indices + out_offset * topk; } else { // Store all calculated distances to out_scores - out_scores = _out_scores + max_samples * query_ix; + out_scores = _out_scores + uint64_t(max_samples) * query_ix; } uint32_t label = cluster_labels[n_probes * query_ix + probe_ix]; - const float* cluster_center = cluster_centers + (dim * label); + const float* cluster_center = cluster_centers + dim * label; const float* pq_center; if (codebook_kind == codebook_gen::PER_SUBSPACE) { pq_center = pq_centers; @@ -602,7 +601,6 @@ template void compute_similarity_run(selected s, rmm::cuda_stream_view stream, - uint32_t n_rows, uint32_t dim, uint32_t n_probes, uint32_t pq_dim, @@ -625,8 +623,7 @@ void compute_similarity_run(selected s, OutT* _out_scores, uint32_t* _out_indices) { - s.kernel<<>>(n_rows, - dim, + s.kernel<<>>(dim, n_probes, pq_dim, n_queries, diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 298083d1e5..b9e911ffe2 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -436,8 +436,10 @@ void ivfpq_search_worker(raft::resources const& handle, auto stream = resource::get_cuda_stream(handle); auto mr = resource::get_workspace_resource(handle); - bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); - auto topk_len = manage_local_topk ? n_probes * topK : max_samples; + bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); + auto topk_len = manage_local_topk ? n_probes * topK : max_samples; + std::size_t n_queries_probes = std::size_t(n_queries) * std::size_t(n_probes); + std::size_t n_queries_topk_len = std::size_t(n_queries) * std::size_t(topk_len); if (manage_local_topk) { RAFT_LOG_DEBUG("Fused version of the search kernel is selected (manage_local_topk == true)"); } else { @@ -448,13 +450,13 @@ void ivfpq_search_worker(raft::resources const& handle, rmm::device_uvector index_list_sorted_buf(0, stream, mr); uint32_t* index_list_sorted = nullptr; rmm::device_uvector num_samples(n_queries, stream, mr); - rmm::device_uvector chunk_index(n_queries * n_probes, stream, mr); + rmm::device_uvector chunk_index(n_queries_probes, stream, mr); // [maxBatchSize, max_samples] or [maxBatchSize, n_probes, topk] - rmm::device_uvector distances_buf(n_queries * topk_len, stream, mr); + rmm::device_uvector distances_buf(n_queries_topk_len, stream, mr); rmm::device_uvector neighbors_buf(0, stream, mr); uint32_t* neighbors_ptr = nullptr; if (manage_local_topk) { - neighbors_buf.resize(n_queries * topk_len, stream); + neighbors_buf.resize(n_queries_topk_len, stream); neighbors_ptr = neighbors_buf.data(); } rmm::device_uvector neighbors_uint32_buf(0, stream, mr); @@ -479,10 +481,10 @@ void ivfpq_search_worker(raft::resources const& handle, // The goal is to incrase the L2 cache hit rate to read the vectors // of a cluster by processing the cluster at the same time as much as // possible. - index_list_sorted_buf.resize(n_queries * n_probes, stream); + index_list_sorted_buf.resize(n_queries_probes, stream); auto index_list_buf = - make_device_mdarray(handle, mr, make_extents(n_queries * n_probes)); - rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); + make_device_mdarray(handle, mr, make_extents(n_queries_probes)); + rmm::device_uvector cluster_labels_out(n_queries_probes, stream, mr); auto index_list = index_list_buf.data_handle(); index_list_sorted = index_list_sorted_buf.data(); @@ -497,7 +499,7 @@ void ivfpq_search_worker(raft::resources const& handle, cluster_labels_out.data(), index_list, index_list_sorted, - n_queries * n_probes, + n_queries_probes, begin_bit, end_bit, stream); @@ -508,7 +510,7 @@ void ivfpq_search_worker(raft::resources const& handle, cluster_labels_out.data(), index_list, index_list_sorted, - n_queries * n_probes, + n_queries_probes, begin_bit, end_bit, stream); @@ -558,7 +560,6 @@ void ivfpq_search_worker(raft::resources const& handle, } compute_similarity_run(search_instance, stream, - index.size(), index.rot_dim(), n_probes, index.pq_dim(), @@ -706,7 +707,11 @@ inline auto get_max_batch_size(raft::resources const& res, } // Check in the tmp distance buffer is not too big auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { - return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs; + const uint64_t buffers_fused = 12ull * k * n_probes; + const uint64_t buffers_non_fused = 4ull * max_samples; + const uint64_t other = 32ull * n_probes; + return static_cast(bs) * + (other + (is_local_topk_feasible(k, n_probes, bs) ? buffers_fused : buffers_non_fused)); }; auto max_ws_size = resource::get_workspace_free_bytes(res); if (ws_size(max_batch_size) > max_ws_size) { diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py index 19c3070fd2..5132048d40 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py @@ -57,7 +57,6 @@ template void raft::neighbors::ivf_pq::detail::compute_similarity_run( \\ raft::neighbors::ivf_pq::detail::selected s, \\ rmm::cuda_stream_view stream, \\ - uint32_t n_rows, \\ uint32_t dim, \\ uint32_t n_probes, \\ uint32_t pq_dim, \\ diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu index 46642b5595..bfc07b0321 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu @@ -47,7 +47,6 @@ raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \ diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu index 03d9fb9171..537868b590 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu @@ -47,7 +47,6 @@ raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \ diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu index 221be5b4fd..59b64b892d 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu @@ -47,7 +47,6 @@ raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \ diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu index b665a37040..f9e899f8e9 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu @@ -47,7 +47,6 @@ raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \ diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu index 1acdab4c2e..bf699d7af6 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu @@ -47,7 +47,6 @@ raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \ diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu index a8ad62c51b..9689ec88e1 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu @@ -47,7 +47,6 @@ raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \ diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu index 91a69b0e54..deed61dd3d 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu @@ -47,7 +47,6 @@ raft::neighbors::ivf_pq::detail::compute_similarity_run( \ raft::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ uint32_t dim, \ uint32_t n_probes, \ uint32_t pq_dim, \