From 1a912abc7600cf2a486b53c5a905ae325f621950 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 5 Oct 2022 15:52:53 +0200 Subject: [PATCH 1/5] Output non-normalized distances in IVF-PQ and brute-force KNN --- .../spatial/knn/detail/ivf_flat_search.cuh | 4 +- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 52 ++++++++++++++----- cpp/test/spatial/ann_utils.cuh | 5 +- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh index 770530b77c..e7c75ec16a 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh @@ -1074,9 +1074,9 @@ void search_impl(const handle_t& handle, rmm::device_uvector coarse_distances_dev(n_queries * n_probes, stream, search_mr); // The topk index of cluster(list) and queries rmm::device_uvector coarse_indices_dev(n_queries * n_probes, stream, search_mr); - // The topk distance value of candicate vectors from each cluster(list) + // The topk distance value of candidate vectors from each cluster(list) rmm::device_uvector refined_distances_dev(n_queries * n_probes * k, stream, search_mr); - // The topk index of candicate vectors from each cluster(list) + // The topk index of candidate vectors from each cluster(list) rmm::device_uvector refined_indices_dev(n_queries * n_probes * k, stream, search_mr); size_t float_query_size; diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 73030ea53f..308132155c 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -393,11 +393,27 @@ void postprocess_neighbors(IdxT* neighbors, // [n_queries, topk] topk); } +template +HDI auto undo_normalization(const float& x, bool squared) + -> std::enable_if_t, float> +{ + return x; +}; + +template +HDI auto undo_normalization(const float& x, bool squared) + -> std::enable_if_t, float> +{ + double kMult = utils::config::kDivisor / utils::config::kDivisor; + if (squared) { kMult *= kMult; } + return x * static_cast(kMult); +}; + /** * Post-process the scores depending on the metric type; * translate the element type if necessary. */ -template +template void postprocess_distances(float* out, // [n_queries, topk] const ScoreT* in, // [n_queries, topk] distance::DistanceType metric, @@ -410,16 +426,28 @@ void postprocess_distances(float* out, // [n_queries, topk] case distance::DistanceType::L2Unexpanded: case distance::DistanceType::L2Expanded: { linalg::unaryOp( - out, in, len, [] __device__(ScoreT x) -> float { return float(x); }, stream); + out, + in, + len, + [] __device__(ScoreT x) -> float { return undo_normalization(float(x), true); }, + stream); } break; case distance::DistanceType::L2SqrtUnexpanded: case distance::DistanceType::L2SqrtExpanded: { linalg::unaryOp( - out, in, len, [] __device__(ScoreT x) -> float { return sqrtf(float(x)); }, stream); + out, + in, + len, + [] __device__(ScoreT x) -> float { return undo_normalization(sqrtf(float(x)), false); }, + stream); } break; case distance::DistanceType::InnerProduct: { linalg::unaryOp( - out, in, len, [] __device__(ScoreT x) -> float { return -float(x); }, stream); + out, + in, + len, + [] __device__(ScoreT x) -> float { return undo_normalization(-float(x), true); }, + stream); } break; default: RAFT_FAIL("Unexpected metric."); } @@ -979,7 +1007,7 @@ struct ivfpq_compute_similarity { * 3. split the query batch into smaller chunks, so that the device workspace * is guaranteed to fit into GPU memory. */ -template +template void ivfpq_search_worker(const handle_t& handle, const index& index, uint32_t max_samples, @@ -1123,7 +1151,7 @@ void ivfpq_search_worker(const handle_t& handle, mr); // Postprocessing - postprocess_distances(distances, topk_dists.data(), index.metric(), n_queries, topK, stream); + postprocess_distances(distances, topk_dists.data(), index.metric(), n_queries, topK, stream); postprocess_neighbors(neighbors, manage_local_topk, data_indices, @@ -1140,7 +1168,7 @@ void ivfpq_search_worker(const handle_t& handle, * This structure helps selecting a proper instance of the worker search function, * which contains a few template parameters. */ -template +template struct ivfpq_search { public: using fun_t = void (*)(const handle_t&, @@ -1177,14 +1205,14 @@ struct ivfpq_search { } switch (params.lut_dtype) { - case CUDA_R_32F: return ivfpq_search_worker; - case CUDA_R_16F: return ivfpq_search_worker; + case CUDA_R_32F: return ivfpq_search_worker; + case CUDA_R_16F: return ivfpq_search_worker; case CUDA_R_8U: case CUDA_R_8I: if (signed_metric) { - return ivfpq_search_worker, IdxT>; + return ivfpq_search_worker, IdxT>; } else { - return ivfpq_search_worker, IdxT>; + return ivfpq_search_worker, IdxT>; } default: RAFT_FAIL("Unexpected lut_dtype (%d)", int(params.lut_dtype)); } @@ -1311,7 +1339,7 @@ inline void search(const handle_t& handle, rmm::device_uvector rot_queries(max_queries * index.rot_dim(), stream, mr); rmm::device_uvector clusters_to_probe(max_queries * params.n_probes, stream, mr); - auto search_instance = ivfpq_search::fun(params, index.metric()); + auto search_instance = ivfpq_search::fun(params, index.metric()); for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { uint32_t queries_batch = min(max_queries, n_queries - offset_q); diff --git a/cpp/test/spatial/ann_utils.cuh b/cpp/test/spatial/ann_utils.cuh index ce92828e8d..ad803094ea 100644 --- a/cpp/test/spatial/ann_utils.cuh +++ b/cpp/test/spatial/ann_utils.cuh @@ -107,7 +107,6 @@ __global__ void naiveDistanceKernel(EvalT* dist, IdxT k, raft::distance::DistanceType type) { - detail::utils::mapping f{}; IdxT midx = threadIdx.x + blockIdx.x * blockDim.x; if (midx >= m) return; for (IdxT nidx = threadIdx.y + blockIdx.y * blockDim.y; nidx < n; @@ -116,8 +115,8 @@ __global__ void naiveDistanceKernel(EvalT* dist, for (IdxT i = 0; i < k; ++i) { IdxT xidx = i + midx * k; IdxT yidx = i + nidx * k; - EvalT xv = f(x[xidx]); - EvalT yv = f(y[yidx]); + EvalT xv = (EvalT)x[xidx]; + EvalT yv = (EvalT)y[yidx]; if (type == raft::distance::DistanceType::InnerProduct) { acc += xv * yv; } else { From 6768e3170e8f956c9a1d8cdcf2aeb44b1f6f4b0c Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 5 Oct 2022 17:12:01 +0200 Subject: [PATCH 2/5] Remove unnecessary function undo_normalization --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 308132155c..1719c831a5 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -393,22 +393,6 @@ void postprocess_neighbors(IdxT* neighbors, // [n_queries, topk] topk); } -template -HDI auto undo_normalization(const float& x, bool squared) - -> std::enable_if_t, float> -{ - return x; -}; - -template -HDI auto undo_normalization(const float& x, bool squared) - -> std::enable_if_t, float> -{ - double kMult = utils::config::kDivisor / utils::config::kDivisor; - if (squared) { kMult *= kMult; } - return x * static_cast(kMult); -}; - /** * Post-process the scores depending on the metric type; * translate the element type if necessary. @@ -422,6 +406,8 @@ void postprocess_distances(float* out, // [n_queries, topk] rmm::cuda_stream_view stream) { size_t len = size_t(n_queries) * size_t(topk); + double kMult = + std::is_same_v ? 1.0 : utils::config::kDivisor / utils::config::kDivisor; switch (metric) { case distance::DistanceType::L2Unexpanded: case distance::DistanceType::L2Expanded: { @@ -429,7 +415,9 @@ void postprocess_distances(float* out, // [n_queries, topk] out, in, len, - [] __device__(ScoreT x) -> float { return undo_normalization(float(x), true); }, + [kMult] __device__(ScoreT x) -> float { + return static_cast(kMult * kMult) * float(x); + }, stream); } break; case distance::DistanceType::L2SqrtUnexpanded: @@ -438,7 +426,9 @@ void postprocess_distances(float* out, // [n_queries, topk] out, in, len, - [] __device__(ScoreT x) -> float { return undo_normalization(sqrtf(float(x)), false); }, + [kMult] __device__(ScoreT x) -> float { + return static_cast(kMult) * sqrtf(float(x)); + }, stream); } break; case distance::DistanceType::InnerProduct: { @@ -446,7 +436,9 @@ void postprocess_distances(float* out, // [n_queries, topk] out, in, len, - [] __device__(ScoreT x) -> float { return undo_normalization(-float(x), true); }, + [kMult] __device__(ScoreT x) -> float { + return -static_cast(kMult * kMult) * float(x); + }, stream); } break; default: RAFT_FAIL("Unexpected metric."); From 8f862aeb7f9866b5b82ec9e59806d406c65d9693 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 6 Oct 2022 12:00:52 +0200 Subject: [PATCH 3/5] Pass scaling factor instead of type to limit template instantiations --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 53ae82e4bb..f7f9c17d9a 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -399,17 +399,17 @@ void postprocess_neighbors(IdxT* neighbors, // [n_queries, topk] * Post-process the scores depending on the metric type; * translate the element type if necessary. */ -template +template void postprocess_distances(float* out, // [n_queries, topk] const ScoreT* in, // [n_queries, topk] distance::DistanceType metric, uint32_t n_queries, uint32_t topk, + double scaling_factor, rmm::cuda_stream_view stream) { size_t len = size_t(n_queries) * size_t(topk); - double kMult = - std::is_same_v ? 1.0 : utils::config::kDivisor / utils::config::kDivisor; + // todo(lsugy): pass as arg switch (metric) { case distance::DistanceType::L2Unexpanded: case distance::DistanceType::L2Expanded: { @@ -417,8 +417,8 @@ void postprocess_distances(float* out, // [n_queries, topk] out, in, len, - [kMult] __device__(ScoreT x) -> float { - return static_cast(kMult * kMult) * float(x); + [scaling_factor] __device__(ScoreT x) -> float { + return static_cast(scaling_factor * scaling_factor) * float(x); }, stream); } break; @@ -428,8 +428,8 @@ void postprocess_distances(float* out, // [n_queries, topk] out, in, len, - [kMult] __device__(ScoreT x) -> float { - return static_cast(kMult) * sqrtf(float(x)); + [scaling_factor] __device__(ScoreT x) -> float { + return static_cast(scaling_factor) * sqrtf(float(x)); }, stream); } break; @@ -438,8 +438,8 @@ void postprocess_distances(float* out, // [n_queries, topk] out, in, len, - [kMult] __device__(ScoreT x) -> float { - return -static_cast(kMult * kMult) * float(x); + [scaling_factor] __device__(ScoreT x) -> float { + return -static_cast(scaling_factor * scaling_factor) * float(x); }, stream); } break; @@ -1006,7 +1006,7 @@ struct ivfpq_compute_similarity { * 3. split the query batch into smaller chunks, so that the device workspace * is guaranteed to fit into GPU memory. */ -template +template void ivfpq_search_worker(const handle_t& handle, const index& index, uint32_t max_samples, @@ -1018,6 +1018,7 @@ void ivfpq_search_worker(const handle_t& handle, const float* query, // [n_queries, rot_dim] IdxT* neighbors, // [n_queries, topK] float* distances, // [n_queries, topK] + double scaling_factor, rmm::mr::device_memory_resource* mr) { auto stream = handle.get_stream(); @@ -1145,7 +1146,8 @@ void ivfpq_search_worker(const handle_t& handle, mr); // Postprocessing - postprocess_distances(distances, topk_dists.data(), index.metric(), n_queries, topK, stream); + postprocess_distances( + distances, topk_dists.data(), index.metric(), n_queries, topK, scaling_factor, stream); postprocess_neighbors(neighbors, manage_local_topk, data_indices, @@ -1162,7 +1164,7 @@ void ivfpq_search_worker(const handle_t& handle, * This structure helps selecting a proper instance of the worker search function, * which contains a few template parameters. */ -template +template struct ivfpq_search { public: using fun_t = void (*)(const handle_t&, @@ -1176,6 +1178,7 @@ struct ivfpq_search { const float*, IdxT*, float*, + double, rmm::mr::device_memory_resource*); /** @@ -1198,14 +1201,14 @@ struct ivfpq_search { } switch (params.lut_dtype) { - case CUDA_R_32F: return ivfpq_search_worker; - case CUDA_R_16F: return ivfpq_search_worker; + case CUDA_R_32F: return ivfpq_search_worker; + case CUDA_R_16F: return ivfpq_search_worker; case CUDA_R_8U: case CUDA_R_8I: if (signed_metric) { - return ivfpq_search_worker, IdxT>; + return ivfpq_search_worker, IdxT>; } else { - return ivfpq_search_worker, IdxT>; + return ivfpq_search_worker, IdxT>; } default: RAFT_FAIL("Unexpected lut_dtype (%d)", int(params.lut_dtype)); } @@ -1332,7 +1335,7 @@ inline void search(const handle_t& handle, rmm::device_uvector rot_queries(max_queries * index.rot_dim(), stream, mr); rmm::device_uvector clusters_to_probe(max_queries * params.n_probes, stream, mr); - auto search_instance = ivfpq_search::fun(params, index.metric()); + auto search_instance = ivfpq_search::fun(params, index.metric()); for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { uint32_t queries_batch = min(max_queries, n_queries - offset_q); @@ -1386,6 +1389,9 @@ inline void search(const handle_t& handle, rot_queries.data() + uint64_t(index.rot_dim()) * offset_b, neighbors + uint64_t(k) * (offset_q + offset_b), distances + uint64_t(k) * (offset_q + offset_b), + std::is_same_v + ? 1.0 + : utils::config::kDivisor / utils::config::kDivisor, mr); } } From 26cd1d2142d4bcf404d26d369c2d4e5e4a618e51 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 6 Oct 2022 12:41:08 +0200 Subject: [PATCH 4/5] Remove comment --- cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index f7f9c17d9a..390b18f141 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -409,7 +409,6 @@ void postprocess_distances(float* out, // [n_queries, topk] rmm::cuda_stream_view stream) { size_t len = size_t(n_queries) * size_t(topk); - // todo(lsugy): pass as arg switch (metric) { case distance::DistanceType::L2Unexpanded: case distance::DistanceType::L2Expanded: { From 8b4d357a8db181c8103ca11fe6e76e104c8f4796 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 6 Oct 2022 13:46:38 +0200 Subject: [PATCH 5/5] Nitpicks --- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index 390b18f141..6a2fccf957 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -405,7 +405,7 @@ void postprocess_distances(float* out, // [n_queries, topk] distance::DistanceType metric, uint32_t n_queries, uint32_t topk, - double scaling_factor, + float scaling_factor, rmm::cuda_stream_view stream) { size_t len = size_t(n_queries) * size_t(topk); @@ -417,7 +417,7 @@ void postprocess_distances(float* out, // [n_queries, topk] in, len, [scaling_factor] __device__(ScoreT x) -> float { - return static_cast(scaling_factor * scaling_factor) * float(x); + return scaling_factor * scaling_factor * float(x); }, stream); } break; @@ -427,9 +427,7 @@ void postprocess_distances(float* out, // [n_queries, topk] out, in, len, - [scaling_factor] __device__(ScoreT x) -> float { - return static_cast(scaling_factor) * sqrtf(float(x)); - }, + [scaling_factor] __device__(ScoreT x) -> float { return scaling_factor * sqrtf(float(x)); }, stream); } break; case distance::DistanceType::InnerProduct: { @@ -438,7 +436,7 @@ void postprocess_distances(float* out, // [n_queries, topk] in, len, [scaling_factor] __device__(ScoreT x) -> float { - return -static_cast(scaling_factor * scaling_factor) * float(x); + return -scaling_factor * scaling_factor * float(x); }, stream); } break; @@ -1017,7 +1015,7 @@ void ivfpq_search_worker(const handle_t& handle, const float* query, // [n_queries, rot_dim] IdxT* neighbors, // [n_queries, topK] float* distances, // [n_queries, topK] - double scaling_factor, + float scaling_factor, rmm::mr::device_memory_resource* mr) { auto stream = handle.get_stream(); @@ -1177,7 +1175,7 @@ struct ivfpq_search { const float*, IdxT*, float*, - double, + float, rmm::mr::device_memory_resource*); /** @@ -1388,9 +1386,7 @@ inline void search(const handle_t& handle, rot_queries.data() + uint64_t(index.rot_dim()) * offset_b, neighbors + uint64_t(k) * (offset_q + offset_b), distances + uint64_t(k) * (offset_q + offset_b), - std::is_same_v - ? 1.0 - : utils::config::kDivisor / utils::config::kDivisor, + utils::config::kDivisor / utils::config::kDivisor, mr); } }