From 3707e389536990a119669ce624b439f1c458e27f Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Wed, 13 Dec 2023 15:47:36 +0900 Subject: [PATCH 01/10] improve parallelism of refine_host --- .../raft/neighbors/detail/refine_host-inl.hpp | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index cfedaa38d3..7570040610 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -23,6 +23,8 @@ #include #include +#include // for debug + namespace raft::neighbors::detail { template @@ -41,6 +43,52 @@ template fun_scope( "neighbors::refine_host(%zu, %zu -> %zu)", n_queries, orig_k, refined_k); +#if 1 + auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads())); + + // For efficiency, each thread should read a certain amount of array elements. + // The number of threads for distance computation is determined taking this into account. + constexpr int n_elements = 512; + size_t max_n_threads = ((n_queries * orig_k * dim) + n_elements - 1) / n_elements; + auto suggested_n_threads_for_distance = std::min(size_t(suggested_n_threads), max_n_threads); + + // The max number of threads for topk computation is the number of queries. + auto suggested_n_threads_for_topk = std::min(size_t(suggested_n_threads), n_queries); + + fprintf( stderr, "# suggested_n_threads, distance:%lu, topk:%lu\r", + suggested_n_threads_for_distance, suggested_n_threads_for_topk ); + + std::vector>> + refined_pairs(n_queries, std::vector>(orig_k)); + + // Compute the refined distance using original dataset vectors +#pragma omp parallel for collapse(2) num_threads(suggested_n_threads_for_distance) + for (size_t i = 0; i < n_queries; i++) { + for (size_t j = 0; j < orig_k; j++) { + const DataT* query = queries.data_handle() + dim * i; + IdxT id = neighbor_candidates(i, j); + const DataT* row = dataset.data_handle() + dim * id; + DistanceT distance = 0.0; + for (size_t k = 0; k < dim; k++) { + distance += DC::template eval(query[k], row[k]); + } + refined_pairs[i][j] = std::make_tuple(distance, id); + } + } + + // Sort the query neighbors by their refined distances +#pragma omp parallel for num_threads(suggested_n_threads_for_topk) + for (size_t i = 0; i < n_queries; i++) { + std::sort(refined_pairs[i].begin(), refined_pairs[i].end()); + // Store first refined_k neighbors + for (size_t j = 0; j < refined_k; j++) { + indices(i, j) = std::get<1>(refined_pairs[i][j]); + if (distances.data_handle() != nullptr) { + distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[i][j])); + } + } + } +#else auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads())); if (size_t(suggested_n_threads) > n_queries) { suggested_n_threads = n_queries; } @@ -70,6 +118,7 @@ template Date: Wed, 13 Dec 2023 15:58:15 +0900 Subject: [PATCH 02/10] Remove debug code --- .../raft/neighbors/detail/refine_host-inl.hpp | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index 48241b25e4..c8134626e5 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -23,8 +23,6 @@ #include #include -#include // for debug - namespace raft::neighbors::detail { template @@ -44,7 +42,6 @@ template fun_scope( "neighbors::refine_host(%zu, %zu -> %zu)", n_queries, orig_k, refined_k); -#if 1 auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads())); // For efficiency, each thread should read a certain amount of array elements. @@ -56,9 +53,6 @@ template >> refined_pairs(n_queries, std::vector>(orig_k)); @@ -89,41 +83,6 @@ template n_queries) { suggested_n_threads = n_queries; } - -#pragma omp parallel num_threads(suggested_n_threads) - { - std::vector> refined_pairs(orig_k); - for (size_t i = omp_get_thread_num(); i < n_queries; i += omp_get_num_threads()) { - // Compute the refined distance using original dataset vectors - const DataT* query = queries.data_handle() + dim * i; - for (size_t j = 0; j < orig_k; j++) { - IdxT id = neighbor_candidates(i, j); - DistanceT distance = 0.0; - if (static_cast(id) >= n_rows) { - distance = std::numeric_limits::max(); - } else { - const DataT* row = dataset.data_handle() + dim * id; - for (size_t k = 0; k < dim; k++) { - distance += DC::template eval(query[k], row[k]); - } - } - refined_pairs[j] = std::make_tuple(distance, id); - } - // Sort the query neighbors by their refined distances - std::sort(refined_pairs.begin(), refined_pairs.end()); - // Store first refined_k neighbors - for (size_t j = 0; j < refined_k; j++) { - indices(i, j) = std::get<1>(refined_pairs[j]); - if (distances.data_handle() != nullptr) { - distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[j])); - } - } - } - } -#endif } struct distance_comp_l2 { From 5421befc36fb28782f02c710f7a7e4f418b3dc4f Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Wed, 13 Dec 2023 17:03:37 +0900 Subject: [PATCH 03/10] Update cpp/include/raft/neighbors/detail/refine_host-inl.hpp Co-authored-by: Artem M. Chirkin <9253178+achirkin@users.noreply.github.com> --- cpp/include/raft/neighbors/detail/refine_host-inl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index c8134626e5..518c215262 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -47,7 +47,7 @@ template (n_queries * orig_k * dim, n_elements); auto suggested_n_threads_for_distance = std::min(size_t(suggested_n_threads), max_n_threads); // The max number of threads for topk computation is the number of queries. From 4982c98c0a46299f18d3527b91aec223184cb71a Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Wed, 13 Dec 2023 18:21:00 +0900 Subject: [PATCH 04/10] Use fine-grained thread parallelism in refine_host only when the number of queries is small --- .../raft/neighbors/detail/refine_host-inl.hpp | 96 +++++++++++++------ 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index 518c215262..f65171f311 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -44,42 +44,82 @@ template (n_queries * orig_k * dim, n_elements); - auto suggested_n_threads_for_distance = std::min(size_t(suggested_n_threads), max_n_threads); + // If the number of queries is small, separete the distance calculation and + // the top-k calculation into separete loops, and apply finer-grained thread + // parallelism to the distance calculation loop. + if ( n_queries < size_t(suggested_n_threads) ) { + std::vector>> + refined_pairs(n_queries, std::vector>(orig_k)); - // The max number of threads for topk computation is the number of queries. - auto suggested_n_threads_for_topk = std::min(size_t(suggested_n_threads), n_queries); + // For efficiency, each thread should read a certain amount of array + // elements. The number of threads for distance computation is determined + // taking this into account. + constexpr int n_elements = 512; + auto max_n_threads = raft::div_rounding_up_safe(n_queries * orig_k * dim, n_elements); + auto suggested_n_threads_for_distance = std::min(size_t(suggested_n_threads), max_n_threads); - std::vector>> - refined_pairs(n_queries, std::vector>(orig_k)); + // The max number of threads for topk computation is the number of queries. + auto suggested_n_threads_for_topk = std::min(size_t(suggested_n_threads), n_queries); - // Compute the refined distance using original dataset vectors + // Compute the refined distance using original dataset vectors #pragma omp parallel for collapse(2) num_threads(suggested_n_threads_for_distance) - for (size_t i = 0; i < n_queries; i++) { - for (size_t j = 0; j < orig_k; j++) { - const DataT* query = queries.data_handle() + dim * i; - IdxT id = neighbor_candidates(i, j); - const DataT* row = dataset.data_handle() + dim * id; - DistanceT distance = 0.0; - for (size_t k = 0; k < dim; k++) { - distance += DC::template eval(query[k], row[k]); + for (size_t i = 0; i < n_queries; i++) { + for (size_t j = 0; j < orig_k; j++) { + const DataT* query = queries.data_handle() + dim * i; + IdxT id = neighbor_candidates(i, j); + const DataT* row = dataset.data_handle() + dim * id; + DistanceT distance = 0.0; + for (size_t k = 0; k < dim; k++) { + distance += DC::template eval(query[k], row[k]); + } + refined_pairs[i][j] = std::make_tuple(distance, id); } - refined_pairs[i][j] = std::make_tuple(distance, id); } - } - // Sort the query neighbors by their refined distances + // Sort the query neighbors by their refined distances #pragma omp parallel for num_threads(suggested_n_threads_for_topk) - for (size_t i = 0; i < n_queries; i++) { - std::sort(refined_pairs[i].begin(), refined_pairs[i].end()); - // Store first refined_k neighbors - for (size_t j = 0; j < refined_k; j++) { - indices(i, j) = std::get<1>(refined_pairs[i][j]); - if (distances.data_handle() != nullptr) { - distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[i][j])); + for (size_t i = 0; i < n_queries; i++) { + std::sort(refined_pairs[i].begin(), refined_pairs[i].end()); + // Store first refined_k neighbors + for (size_t j = 0; j < refined_k; j++) { + indices(i, j) = std::get<1>(refined_pairs[i][j]); + if (distances.data_handle() != nullptr) { + distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[i][j])); + } + } + } + return; + } + + if (size_t(suggested_n_threads) > n_queries) { suggested_n_threads = n_queries; } + +#pragma omp parallel num_threads(suggested_n_threads) + { + std::vector> refined_pairs(orig_k); + for (size_t i = omp_get_thread_num(); i < n_queries; i += omp_get_num_threads()) { + // Compute the refined distance using original dataset vectors + const DataT* query = queries.data_handle() + dim * i; + for (size_t j = 0; j < orig_k; j++) { + IdxT id = neighbor_candidates(i, j); + DistanceT distance = 0.0; + if (static_cast(id) >= n_rows) { + distance = std::numeric_limits::max(); + } else { + const DataT* row = dataset.data_handle() + dim * id; + for (size_t k = 0; k < dim; k++) { + distance += DC::template eval(query[k], row[k]); + } + } + refined_pairs[j] = std::make_tuple(distance, id); + } + // Sort the query neighbors by their refined distances + std::sort(refined_pairs.begin(), refined_pairs.end()); + // Store first refined_k neighbors + for (size_t j = 0; j < refined_k; j++) { + indices(i, j) = std::get<1>(refined_pairs[j]); + if (distances.data_handle() != nullptr) { + distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[j])); + } } } } From cfba110d3fd7d7a8b9952cc15055b7ced55cf4ff Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Wed, 13 Dec 2023 19:11:23 +0900 Subject: [PATCH 05/10] Added consideration for large dimensionality of dataset --- cpp/include/raft/neighbors/detail/refine_host-inl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index f65171f311..a00856fb96 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -54,7 +54,7 @@ template (n_queries * orig_k * dim, n_elements); auto suggested_n_threads_for_distance = std::min(size_t(suggested_n_threads), max_n_threads); From 080b221de8a15eeb2ea1bafd84beca34d83428dc Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Wed, 13 Dec 2023 19:21:33 +0900 Subject: [PATCH 06/10] Updated distance calculation part to 24.02 based implementation --- .../raft/neighbors/detail/refine_host-inl.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index a00856fb96..5a9e22cc90 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -67,11 +67,15 @@ template (query[k], row[k]); - } + if (static_cast(id) >= n_rows) { + distance = std::numeric_limits::max(); + } else { + const DataT* row = dataset.data_handle() + dim * id; + for (size_t k = 0; k < dim; k++) { + distance += DC::template eval(query[k], row[k]); + } + } refined_pairs[i][j] = std::make_tuple(distance, id); } } From c9bfc9ad4ce09f37337ff96deefa7626510a247a Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Thu, 14 Dec 2023 17:20:33 +0900 Subject: [PATCH 07/10] Satisfy pre-commit --- .../raft/neighbors/detail/refine_host-inl.hpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index 5a9e22cc90..7b7e916dfa 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -47,14 +47,14 @@ template >> - refined_pairs(n_queries, std::vector>(orig_k)); + if (n_queries < size_t(suggested_n_threads)) { + std::vector>> refined_pairs( + n_queries, std::vector>(orig_k)); // For efficiency, each thread should read a certain amount of array // elements. The number of threads for distance computation is determined // taking this into account. - auto n_elements = std::max(size_t(512), dim); + auto n_elements = std::max(size_t(512), dim); auto max_n_threads = raft::div_rounding_up_safe(n_queries * orig_k * dim, n_elements); auto suggested_n_threads_for_distance = std::min(size_t(suggested_n_threads), max_n_threads); @@ -65,9 +65,9 @@ template (id) >= n_rows) { distance = std::numeric_limits::max(); } else { @@ -76,7 +76,7 @@ template (query[k], row[k]); } } - refined_pairs[i][j] = std::make_tuple(distance, id); + refined_pairs[i][j] = std::make_tuple(distance, id); } } @@ -86,10 +86,10 @@ template (refined_pairs[i][j]); - if (distances.data_handle() != nullptr) { - distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[i][j])); - } + indices(i, j) = std::get<1>(refined_pairs[i][j]); + if (distances.data_handle() != nullptr) { + distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[i][j])); + } } } return; From 0e1b9845d1d36468022e334bf8799c4404b381b7 Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Fri, 15 Dec 2023 12:22:15 +0900 Subject: [PATCH 08/10] Satisfy spell checker --- cpp/include/raft/neighbors/detail/refine_host-inl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index 7b7e916dfa..33f4f64fdb 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -44,8 +44,8 @@ template >> refined_pairs( From 0482306dcb9a9bbfb1ef34a9a79222b5d98a483d Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Tue, 9 Jan 2024 12:08:04 +0900 Subject: [PATCH 09/10] Satisfy style-checker --- cpp/include/raft/neighbors/detail/refine_host-inl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index 33f4f64fdb..fb21d5593b 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 04b97d3c9273455bc2af99956d25050f3f362701 Mon Sep 17 00:00:00 2001 From: Akira Naruse Date: Tue, 9 Jan 2024 12:37:18 +0900 Subject: [PATCH 10/10] Added a necessary header file to div_rounding_up_safe() --- cpp/include/raft/neighbors/detail/refine_host-inl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp index fb21d5593b..a54525f3e6 100644 --- a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include